Database Reference
In-Depth Information
code/hbase/generate_wiki_links.rb
import 'org.apache.hadoop.hbase.client.HTable'
import 'org.apache.hadoop.hbase.client.Put'
import 'org.apache.hadoop.hbase.client.Scan'
import 'org.apache.hadoop.hbase.util.Bytes'
# Converts every argument into a Java byte array so it can be handed to the
# HBase client API, which takes row keys, column families, qualifiers and
# values as byte[].
#
# Each argument is first converted with +to_s+, so non-string values
# (numbers, symbols, nil) are accepted; nil becomes an empty byte array.
#
# @param args [Array<Object>] values to convert
# @return [Array] one byte array per argument, in the same order; meant to be
#   splatted into a Java call, e.g. +Put.new(*jbytes(row))+
def jbytes(*args)
  args.map { |arg| arg.to_s.to_java_bytes }
end
# Builds the 'links' table from the 'wiki' table.
#
# Every row of 'wiki' is scanned; the value stored under family 'text' with an
# empty qualifier is searched for [[Target]] / [[Target|Label]] links. For each
# link found on a page, two cells are written to 'links':
#   * row = page title, column to:<target>,  value = label
#   * row = target,     column from:<title>, value = label
wiki_table = HTable.new(@hbase.configuration, 'wiki')
links_table = HTable.new(@hbase.configuration, 'links')
# Auto-flush is turned off here; flushCommits is called explicitly below.
links_table.setAutoFlush(false)
scanner = wiki_table.getScanner(Scan.new)

# Capture 1: link target (must not start with '[', ']', '|', ':' or '#', and
# may not contain '[', ']', '|' or ':').
# Capture 2: optional label following a '|'.
linkpattern = /\[\[([^\[\]\|\:\#][^\[\]\|:]*)(?:\|([^\[\]\|]+))?\]\]/
count = 0

begin
  while (result = scanner.next)
    title = Bytes.toString(result.getRow)
    text = Bytes.toString(result.getValue(*jbytes('text', '')))

    if text
      # Created lazily so pages without any link produce no Put at all.
      put_to = nil

      text.scan(linkpattern) do |target, label|
        target.strip!
        # The pattern allows a whitespace-only target, which would yield an
        # empty row key for put_from.
        # NOTE(review): assumed to be rejected by HBase - confirm.
        next if target.empty?

        # NOTE(review): capitalize! also downcases the rest of the target;
        # kept as in the original.
        target.capitalize!
        label = label.to_s.strip

        unless put_to
          put_to = Put.new(*jbytes(title))
          put_to.setWriteToWAL(false)
        end
        put_to.add(*jbytes("to", target, label))

        put_from = Put.new(*jbytes(target))
        put_from.add(*jbytes("from", title, label))
        put_from.setWriteToWAL(false)
        links_table.put(put_from)
      end

      links_table.put(put_to) if put_to
      links_table.flushCommits
    end

    count += 1
    puts " #{count} pages processed ( #{title} )" if count % 500 == 0
  end

  links_table.flushCommits
ensure
  # Close the scanner even if the scan aborts part-way.
  scanner.close
end

exit
 
Search WWH ::




Custom Search