INSERT OVERWRITE TABLE docs
SELECT * FROM raw_docs WHERE doc_id <> 'doc_id';
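Here raw_docs is assumed to have been loaded directly from a delimited text file, which is why its header row appears as data and must be filtered out by the doc_id <> 'doc_id' predicate. A minimal sketch of what such a table might look like (the column names and delimiter are assumptions; the actual DDL is not shown in this excerpt):

-- hypothetical DDL for the raw input table; the header line of the
-- source file lands in the table as an ordinary row
CREATE TABLE raw_docs (doc_id STRING, text STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE;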
CREATE TABLE stop (stop STRING);

INSERT OVERWRITE TABLE stop
SELECT * FROM raw_stop WHERE stop <> 'stop';
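The same header-row trick is applied to the stopword list. With both tables populated, a quick spot check from the Hive CLI (optional) confirms the headers were dropped:

-- peek at a few stopwords, then count the rows
SELECT * FROM stop LIMIT 5;
SELECT COUNT(*) FROM stop;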
Then we tokenize using an external Python script, which also handles scrubbing the
tokens:
CREATE TABLE tokens (token STRING);

INSERT OVERWRITE TABLE tokens
SELECT TRANSFORM(text)
  USING 'python ./src/scripts/tokenizer.py' AS token
FROM docs;
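One operational note: when the query runs on a distributed cluster, the script has to be shipped to the worker nodes before TRANSFORM can invoke it; in Hive that is typically handled with ADD FILE (a sketch, assuming the same relative path as above):

-- distribute the tokenizer script to the task nodes
ADD FILE ./src/scripts/tokenizer.py;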
Let's take a look at that Python script, too; streaming rows through an external script like this is an alternative approach to creating UDFs:
#!/usr/bin/env python
# encoding: utf-8

import re
import sys

# match from the first word character, and up through the last one
pat_l = re.compile(r"\w.*")
pat_r = re.compile(r".*\w")


def tokenize(line):
    """
    split a line of text into a stream of tokens,
    while scrubbing the tokens
    """
    for t0 in line.split(" "):
        # trim non-word characters from the left of the token...
        m = pat_l.search(t0)

        if m is None:
            # the token had no word characters at all (e.g. "--")
            continue

        # ...then from the right; this search always matches, because
        # the remaining text is guaranteed to contain a word character
        token = pat_r.search(m.group()).group()

        if len(token) > 0:
            yield token


if __name__ == "__main__":
    for line in sys.stdin:
        for token in tokenize(line.strip().lower()):
            print(token)
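Because TRANSFORM simply streams rows through the script's standard input and output, the tokenizer can be smoke-tested from the shell before wiring it into Hive:

$ echo "The quick (brown) fox!" | python ./src/scripts/tokenizer.py
the
quick
brown
fox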
Finally, filter with a left join, then group and count:
SELECT token, COUNT(*) AS count
FROM (
  SELECT *
  FROM tokens LEFT OUTER JOIN stop
  ON (tokens.token = stop.stop)
  WHERE stop.stop IS NULL
) t
GROUP BY token;
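The left outer join acts as an anti-join here: tokens with no match in the stop table come back with NULL in the stop column, so the WHERE clause keeps exactly the non-stopwords. This idiom is useful in Hive versions that lack support for NOT IN subqueries. To read off the most frequent tokens first, the result can also be sorted; a minimal sketch wrapping the query above:

-- same filter, grouped and sorted by descending count
SELECT token, COUNT(*) AS count
FROM (
  SELECT *
  FROM tokens LEFT OUTER JOIN stop
  ON (tokens.token = stop.stop)
  WHERE stop.stop IS NULL
) t
GROUP BY token
ORDER BY count DESC;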