USING A SPIDER - HTTP Programming Recipes for Java Bots

Java Reference

In-Depth Information

* The type of link this URL is.

* @return True if the spider should scan for links on

* this page.

*/

public boolean spiderFoundURL(URL url, URL source,
    SpiderReportable.URLType type) {
  // With no base host configured there is nothing to filter on,
  // so every discovered URL is accepted.
  if (this.base == null) {
    return true;
  }

  // Otherwise accept the URL only when its host matches the base
  // host, compared without regard to case.
  return this.base.equalsIgnoreCase(url.getHost());
}

/**

* Called when the spider is about to process a NON-HTML

* URL.

*

* @param url

* The URL that the spider found.

* @param stream

* An InputStream to read the page contents from.

* @throws IOException

* Thrown if an IO error occurs while processing

* the page.

*/

public void spiderProcessURL(URL url, InputStream stream)

throws IOException {

byte[] buffer = new byte[1024];

int length;

String filename =

URLUtility.convertFilename(this.path, url, true);

try {

OutputStream os = new FileOutputStream(filename);

do {

length = stream.read(buffer);

if (length != -1) {

os.write(buffer, 0, length);

}

} while (length != -1);

os.close();

} catch (FileNotFoundException e) {

Search WWH ::

Custom Search

Home