import java.util.TreeMap; import java.util.Iterator; import java.util.Set; import java.net.URL; import java.util.StringTokenizer; /** * A class to count the number of occurences of HTML tags in an html file.
* This class illustrates the use of a TreeMap. */ public class HTMLTagCounter { // The map to store the list of tags and their number of occurences // A tree map allows the ordering of the entry. We provide our own ordering // since we want first the begin tags (such as ) in alphabetical order // and then the end tags (such as ) also in alphabetical order. private TreeMap HTMLTagMap; // is an HTML tag currently read? private boolean inATag; // the HTML tag currently extracted private String currentTag; /** * Construct the list of HTML tags and their number of occurences * in an html file. The file is specified by its url. Examples of url's are * http://www.seattlecentral.org * file:///C:/temp/myFile.html (on a windows machine) * @param url the location of the html file */ public HTMLTagCounter(String URLName) { // Read the list of tags from the html file InputURL input = new InputURL(URLName); HTMLTagMap = new TreeMap(new HTMLTagComparator()); // No tag initially currentTag = ""; // Read the html file line by line String line; while( (line=input.readLine())!=null) { // add the tags in the current line to the map addHTMLTagsIn(line); } } /** * Add (in alphabetical order) all of the HTML tags in this line to the map */ private void addHTMLTagsIn(String line) { // Look for < if the start of a tag has not been found yet if (!inATag) { int startTagIndex=line.indexOf("<"); if ( startTagIndex != -1 ) { line = line.substring(startTagIndex+1); inATag = true; } else // no tag in this line return; } // At this point, a tag has been found // The tag is the first word up to the first > (unless we already have // a tag and are just looking for >). StringTokenizer st = new StringTokenizer(line); while ( (currentTag.length()==0 || currentTag.equals("/")) && st.hasMoreTokens()) { String word = st.nextToken(); int index = 0; while( index"); if (endTagIndex != -1) { // Found the end of the tag inATag = false; currentTag = "<" + currentTag + ">"; // Add this tag to the list of tags addHTMLTag(currentTag); // Next tag currentTag = ""; // Move in line past that tag line = line.substring(endTagIndex+1); } else // no end of the tag in this line return; // Look for tags in what is left in line if (line.length()>0) addHTMLTagsIn(line); } /** * add a tag to the list of tags and update the number of occurences * of that tag * @param tag the HTML tag (given as a String) */ private void addHTMLTag(String tag) { // Make the tag all upper case tag = tag.toUpperCase(); // Is it already in the map? if (HTMLTagMap.containsKey(tag)) { // Add 1 to its number of occurences Integer i = (Integer)HTMLTagMap.get(tag); i = new Integer(i.intValue()+1); HTMLTagMap.put(tag,i); } else // Create a new entry HTMLTagMap.put(tag,new Integer(1)); } /** * Return in a String the list of all of the tags and their occurences */ public String toString() { // Write the tags with their number of occurences String output = ""; Set listOfTags = HTMLTagMap.keySet(); Iterator it = listOfTags.iterator(); while(it.hasNext()) { String tag = (String)it.next(); // if it is a begin tag if (tag.charAt(1)!='/') { // write the tag and its number of occurences output += entryAsAString(tag)+"\n"; // do the same with its associated end tag (if any) tag = " if not present if (tag.charAt(0)!='<') tag = "<"+tag; if (tag.charAt(tag.length()-1)!='>') tag = tag+">"; // Find it in the map if (HTMLTagMap.containsKey(tag)) return ((Integer)HTMLTagMap.get(tag)).intValue(); else return 0; } /** * To test the class */ public static void main(String[] args) { uwcse.io.Input input = new uwcse.io.Input(); // Get the url String urlName = input.readString("URL (e.g. http://www.washington.edu): "); // Count the tags in this file HTMLTagCounter c = new HTMLTagCounter(urlName); System.out.println(c.toString()); // Count tag by tag String tag; do { tag=input.readString("Tag to count (0 to stop): "); if (!tag.equals("0")) System.out.println(tag+": "+c.numberOfOccurences(tag)); }while(!tag.equals("0")); } }