INSERT OVERWRITE TABLE docs
SELECT * FROM raw_docs WHERE doc_id <> 'doc_id';
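Here raw_docs is assumed to have been loaded directly from a delimited text file, which is why its header row appears as data and must be filtered out by the doc_id <> 'doc_id' predicate. A minimal sketch of what such a table might look like (the column names and delimiter are assumptions; the actual DDL is not shown in this excerpt):

-- hypothetical DDL for the raw input table; the header line of the
-- source file lands in the table as an ordinary row
CREATE TABLE raw_docs (doc_id STRING, text STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE;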
CREATE TABLE stop (stop STRING);

INSERT OVERWRITE TABLE stop
SELECT * FROM raw_stop WHERE stop <> 'stop';
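The same header-row trick is applied to the stopword list. With both tables populated, a quick spot check from the Hive CLI (optional) confirms the headers were dropped:

-- peek at a few stopwords, then count the rows
SELECT * FROM stop LIMIT 5;
SELECT COUNT(*) FROM stop;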
Then we tokenize using an external Python script, which also handles scrubbing the
tokens:
CREATE TABLE tokens (token STRING);

INSERT OVERWRITE TABLE tokens
SELECT TRANSFORM(text)
  USING 'python ./src/scripts/tokenizer.py' AS token
FROM docs;
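One operational note: when the query runs on a distributed cluster, the script has to be shipped to the worker nodes before TRANSFORM can invoke it; in Hive that is typically handled with ADD FILE (a sketch, assuming the same relative path as above):

-- distribute the tokenizer script to the task nodes
ADD FILE ./src/scripts/tokenizer.py;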
Let's take a look at that Python script, too; streaming rows through an external script like this is an alternative approach to creating UDFs:
#!/usr/bin/env python
# encoding: utf-8

import re
import sys

# match from the first word character, and up through the last one
pat_l = re.compile(r"\w.*")
pat_r = re.compile(r".*\w")


def tokenize(line):
    """
    split a line of text into a stream of tokens,
    while scrubbing the tokens
    """
    for t0 in line.split(" "):
        # trim non-word characters from the left of the token...
        m = pat_l.search(t0)

        if m is None:
            # the token had no word characters at all (e.g. "--")
            continue

        # ...then from the right; this search always matches, because
        # the remaining text is guaranteed to contain a word character
        token = pat_r.search(m.group()).group()

        if len(token) > 0:
            yield token


if __name__ == "__main__":
    for line in sys.stdin:
        for token in tokenize(line.strip().lower()):
            print(token)
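Because TRANSFORM simply streams rows through the script's standard input and output, the tokenizer can be smoke-tested from the shell before wiring it into Hive:

$ echo "The quick (brown) fox!" | python ./src/scripts/tokenizer.py
the
quick
brown
fox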
Finally, filter with a left join, then group and count:
SELECT token, COUNT(*) AS count
FROM (
  SELECT *
  FROM tokens LEFT OUTER JOIN stop
  ON (tokens.token = stop.stop)
  WHERE stop.stop IS NULL
) t
GROUP BY token;
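The left outer join acts as an anti-join here: tokens with no match in the stop table come back with NULL in the stop column, so the WHERE clause keeps exactly the non-stopwords. This idiom is useful in Hive versions that lack support for NOT IN subqueries. To read off the most frequent tokens first, the result can also be sorted; a minimal sketch wrapping the query above:

-- same filter, grouped and sorted by descending count
SELECT token, COUNT(*) AS count
FROM (
  SELECT *
  FROM tokens LEFT OUTER JOIN stop
  ON (tokens.token = stop.stop)
  WHERE stop.stop IS NULL
) t
GROUP BY token
ORDER BY count DESC;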