Database Reference
In-Depth Information
code/hbase/generate_wiki_links.rb
import 'org.apache.hadoop.hbase.client.HTable'
import 'org.apache.hadoop.hbase.client.Put'
import 'org.apache.hadoop.hbase.client.Scan'
import 'org.apache.hadoop.hbase.util.Bytes'
# Converts each argument to a Java byte array.
#
# Every argument is first turned into a String with #to_s, so symbols and
# numbers are accepted alongside strings. The result is an Array, which the
# rest of this script splats into HBase client calls (e.g. `Put.new(*jbytes(row))`).
#
# Relies on String#to_java_bytes, which is provided by JRuby.
#
# @param args [Array<#to_s>] values to convert
# @return [Array] one byte array per argument, in argument order
def jbytes(*args)
  args.map { |arg| arg.to_s.to_java_bytes }
end
# Scans every row of the 'wiki' table, extracts [[target]] / [[target|label]]
# links from each page's text, and records them in the 'links' table:
#   * row = linking page title, column "to:<target>",  value = label
#   * row = link target,        column "from:<title>", value = label
# Intended to be run inside the HBase shell, which supplies @hbase.

wiki_table = HTable.new(@hbase.configuration, 'wiki')
links_table = HTable.new(@hbase.configuration, 'links')
# Auto-flush is turned off; writes are pushed by the explicit flushCommits
# calls below (once per page, and once more at the end).
links_table.setAutoFlush(false)

scanner = wiki_table.getScanner(Scan.new)

# Capture 1: the link target -- a first character that is none of [ ] | : #
#            followed by any run of characters other than [ ] | :
# Capture 2: the optional label after a '|' (nil when the link has no label).
linkpattern = /\[\[([^\[\]\|\:\#][^\[\]\|:]*)(?:\|([^\[\]\|]+))?\]\]/
count = 0

while (result = scanner.next())
  title = Bytes.toString(result.getRow())
  # Read the page body from the 'text' family with an empty qualifier.
  text = Bytes.toString(result.getValue(*jbytes('text', '')))

  if text
    # Created lazily so that pages without any links produce no "to" row.
    put_to = nil

    text.scan(linkpattern) do |target, label|
      unless put_to
        put_to = Put.new(*jbytes(title))
        put_to.setWriteToWAL(false)
      end

      target.strip!
      target.capitalize!

      # Unlabelled links leave the second capture nil; store an empty label.
      label = '' unless label
      label.strip!

      # Outgoing link, accumulated on the current page's row.
      put_to.add(*jbytes("to", target, label))

      # Incoming link, written on the target page's row.
      put_from = Put.new(*jbytes(target))
      put_from.add(*jbytes("from", title, label))
      put_from.setWriteToWAL(false)
      links_table.put(put_from)
    end

    links_table.put(put_to) if put_to
    links_table.flushCommits()
  end

  count += 1
  puts "#{count} pages processed (#{title})" if count % 500 == 0
end

links_table.flushCommits()
exit