INSIDE THE HEATON RESEARCH SPIDER - HTTP Programming Recipes for Java Bots

Java Reference

In-Depth Information

public int read() throws IOException {

int result = super.read();

if (result == 0) {

HTMLTag tag = getTag();

if (tag.getName().equalsIgnoreCase("a")) {

String href = tag.getAttributeValue("href");

handleA(href);

} else if (tag.getName().equalsIgnoreCase("img")) {

String src = tag.getAttributeValue("src");

addURL(src, SpiderReportable.URLType.IMAGE);

} else if (tag.getName().equalsIgnoreCase("style")) {

String src = tag.getAttributeValue("src");

addURL(src, SpiderReportable.URLType.STYLE);

} else if (tag.getName().equalsIgnoreCase("link")) {

String href = tag.getAttributeValue("href");

addURL(href, SpiderReportable.URLType.SCRIPT);

} else if (tag.getName().equalsIgnoreCase("base")) {

String href = tag.getAttributeValue("href");

this.base = new URL(this.base, href);

}

return result;

}

/**

* Read all characters on the page. This will discard

* these characters, but allow the spider to examine the

* tags and find links.

*

* @throws IOException

* I/O error.

*/

public void readAll() throws IOException {

while (read() != -1) {

;

}

/**

* Used internally, to add a URL to the spider's workload.

*

* @param u

* The URL to add.

* @param type

* What type of link this is.

Search WWH ::

Custom Search

Home