Extract Text from HTML using NekoHTML and Dom4j

Consider the link http://www.cdw.com/shop/search/results.aspx?wclss=C3&enkwrd=laptop&searchscope=ALL.
Let's extract the total number of search results and the title of each item, which keeps the example simple. We will use XPath to locate each element in the HTML page.
Here is the code:

package com.asc.dyutiman.html;

import java.io.IOException;
import java.util.List;

import org.cyberneko.html.parsers.DOMParser;
import org.dom4j.Node;
import org.dom4j.io.DOMReader;
import org.w3c.dom.Document;
import org.xml.sax.SAXException;

/**
 * Example program: loads a search-results page with the NekoHTML
 * {@code DOMParser}, converts the resulting W3C DOM into a dom4j document
 * and uses XPath to print the result count and the title of every item.
 */
public class Parse {
	
	/**
	 * Fetches and parses the search page, then prints a "Showing X out of Y"
	 * summary followed by one line per item title. Parse and I/O failures are
	 * reported on standard error instead of being propagated.
	 *
	 * @param args command-line arguments; not used
	 */
	public static void main(String[] args){
		
		String url = "http://www.cdw.com/shop/search/results.aspx?wclss=C3&enkwrd=laptop&searchscope=ALL";
		try {
			DOMParser parser = new DOMParser();
			parser.parse(url);
			
			// Bridge the W3C DOM produced by the parser into dom4j so that
			// the XPath helpers (selectNodes / selectSingleNode) can be used.
			Document document = parser.getDocument();
			DOMReader reader = new DOMReader();
			org.dom4j.Document doc = reader.read(document);
			
			// Element names are written in uppercase in every XPath below.
			Node totalResultNode = doc.selectSingleNode("//SPAN[@id='lblShowingResultsTop']/B[3]");
			
			// Kept for dom4j versions whose selectNodes returns a raw List.
			@SuppressWarnings("unchecked")
			List<Node> itemList =  doc.selectNodes("//DIV[@class = 'searchrow']");

			// selectSingleNode yields null when nothing matches (e.g. the page
			// layout changed), so guard before dereferencing.
			String totalResults = (totalResultNode != null) ? totalResultNode.getText() : "unknown";
			System.out.println("Showing " + itemList.size() + " out of " + totalResults);
			for(Node itemNode : itemList){
				Node itemTitle = itemNode.selectSingleNode("DIV[@class = 'searchrow-description']/A");
				if (itemTitle == null) {
					// Row without the expected title link: skip it rather than crash.
					continue;
				}
				System.out.println(itemTitle.getText());
			}
		} catch (SAXException e) {
			// Print the exception itself: getMessage() alone may be null and hides the type.
			System.err.println("Failed to parse " + url + ": " + e);
		} catch (IOException e) {
			System.err.println("Failed to read " + url + ": " + e);
		}
	}
}

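Remember to write HTML element names in uppercase in the XPath expressions (for example SPAN, DIV, A): by default NekoHTML converts element names to uppercase, while attribute names such as id and class stay lowercase.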

Advertisements