SELECT * FROM raw_docs WHERE doc_id <> 'doc_id';

CREATE TABLE stop (stop STRING);

INSERT OVERWRITE TABLE stop
SELECT * FROM raw_stop WHERE stop <> 'stop';
Then we tokenize using an external Python script, which also handles scrubbing the
tokens:
CREATE TABLE tokens (token STRING);

INSERT OVERWRITE TABLE tokens
SELECT TRANSFORM(text) USING 'python ./src/scripts/tokenizer.py' AS token
FROM docs;
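When this query runs on a cluster rather than in local mode, the script itself must be shipped out to the worker nodes. A minimal sketch of that step, assuming the same ./src/scripts/tokenizer.py path, adds the file to Hive's distributed cache so that TRANSFORM can reference it by file name:

-- sketch: ship the script to the cluster, then reference it by name
ADD FILE ./src/scripts/tokenizer.py;

INSERT OVERWRITE TABLE tokens
SELECT TRANSFORM(text) USING 'python tokenizer.py' AS token
FROM docs;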
Let's take a look at that Python script, too—this is an alternative approach for creating
UDFs:
#!/usr/bin/env python
# encoding: utf-8

import re
import sys

pat_l = re.compile("\w.*")
pat_r = re.compile(".*\w")


def tokenize(line):
    """
    split a line of text into a stream of tokens,
    while scrubbing the tokens
    """
    for token in map(lambda t1: re.search(pat_r, t1).group(),
                     map(lambda t0: re.search(pat_l, t0).group(),
                         line.split(" "))):
        if len(token) > 0:
            yield token


if __name__ == "__main__":
    for line in sys.stdin:
        for token in tokenize(line.strip().lower()):
            print token
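Because TRANSFORM streams each row to the script on stdin and reads the results back one per line from stdout, the script can also be exercised on its own at the shell, outside of Hive. For example, with a made-up line of input:

$ echo "Just a Flesh Wound?" | python ./src/scripts/tokenizer.py
just
a
flesh
wound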
Finally, filter out the stop words with a left outer join: tokens with no match in the stop table come back with NULL on the stop side, so keeping only those rows discards the stop words. Then group and count:
SELECT token, COUNT(*) AS count
FROM (
  SELECT *
  FROM tokens LEFT OUTER JOIN stop
  ON (tokens.token = stop.stop)
  WHERE stop.stop IS NULL
) t
GROUP BY token
;
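The listing above leaves the result order unspecified. If the most frequent tokens should come first, one variation (a sketch, not part of the original example) orders by the aggregated count and limits the output:

SELECT token, COUNT(*) AS count
FROM (
  SELECT *
  FROM tokens LEFT OUTER JOIN stop
  ON (tokens.token = stop.stop)
  WHERE stop.stop IS NULL
) t
GROUP BY token
ORDER BY count DESC
LIMIT 20;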