import java.util.TreeMap;
import java.util.Iterator;
import java.util.Set;
import java.net.URL;
import java.util.StringTokenizer;
/**
* A class to count the number of occurences of HTML tags in an html file.
* This class illustrates the use of a TreeMap.
*/
public class HTMLTagCounter {
// The map to store the list of tags and their number of occurences
// A tree map allows the ordering of the entry. We provide our own ordering
// since we want first the begin tags (such as ) in alphabetical order
// and then the end tags (such as ) also in alphabetical order.
private TreeMap HTMLTagMap;
// is an HTML tag currently read?
private boolean inATag;
// the HTML tag currently extracted
private String currentTag;
/**
* Construct the list of HTML tags and their number of occurences
* in an html file. The file is specified by its url. Examples of url's are
* http://www.seattlecentral.org
* file:///C:/temp/myFile.html (on a windows machine)
* @param url the location of the html file
*/
public HTMLTagCounter(String URLName)
{
// Read the list of tags from the html file
InputURL input = new InputURL(URLName);
HTMLTagMap = new TreeMap(new HTMLTagComparator());
// No tag initially
currentTag = "";
// Read the html file line by line
String line;
while( (line=input.readLine())!=null)
{
// add the tags in the current line to the map
addHTMLTagsIn(line);
}
}
/**
* Add (in alphabetical order) all of the HTML tags in this line to the map
*/
private void addHTMLTagsIn(String line)
{
// Look for < if the start of a tag has not been found yet
if (!inATag)
{
int startTagIndex=line.indexOf("<");
if ( startTagIndex != -1 )
{
line = line.substring(startTagIndex+1);
inATag = true;
}
else
// no tag in this line
return;
}
// At this point, a tag has been found
// The tag is the first word up to the first > (unless we already have
// a tag and are just looking for >).
StringTokenizer st = new StringTokenizer(line);
while ( (currentTag.length()==0 || currentTag.equals("/")) && st.hasMoreTokens())
{
String word = st.nextToken();
int index = 0;
while( index' )
{
currentTag += ""+word.charAt(index);
index++;
}
}
// Look for the end of the tag
int endTagIndex = line.indexOf(">");
if (endTagIndex != -1)
{
// Found the end of the tag
inATag = false;
currentTag = "<" + currentTag + ">";
// Add this tag to the list of tags
addHTMLTag(currentTag);
// Next tag
currentTag = "";
// Move in line past that tag
line = line.substring(endTagIndex+1);
}
else // no end of the tag in this line
return;
// Look for tags in what is left in line
if (line.length()>0)
addHTMLTagsIn(line);
}
/**
* add a tag to the list of tags and update the number of occurences
* of that tag
* @param tag the HTML tag (given as a String)
*/
private void addHTMLTag(String tag)
{
// Make the tag all upper case
tag = tag.toUpperCase();
// Is it already in the map?
if (HTMLTagMap.containsKey(tag))
{
// Add 1 to its number of occurences
Integer i = (Integer)HTMLTagMap.get(tag);
i = new Integer(i.intValue()+1);
HTMLTagMap.put(tag,i);
}
else
// Create a new entry
HTMLTagMap.put(tag,new Integer(1));
}
/**
* Return in a String the list of all of the tags and their occurences
*/
public String toString()
{
// Write the tags with their number of occurences
String output = "";
Set listOfTags = HTMLTagMap.keySet();
Iterator it = listOfTags.iterator();
while(it.hasNext())
{
String tag = (String)it.next();
// if it is a begin tag
if (tag.charAt(1)!='/')
{
// write the tag and its number of occurences
output += entryAsAString(tag)+"\n";
// do the same with its associated end tag (if any)
tag = ""+tag.substring(1);
if (HTMLTagMap.containsKey(tag))
{
output += entryAsAString(tag)+"\n";
}
}
else // it is an end tag
{
// write it only if the associated begin tag has not already been written
if (!HTMLTagMap.containsKey("<"+tag.substring(2)))
output += entryAsAString(tag)+"\n";
}
}
return output;
}
/**
* Return as a String an entry of the map.
* @param tag the key of the entry in the map
*/
private String entryAsAString(String tag)
{
if (HTMLTagMap.containsKey(tag))
{
int i = ((Integer)HTMLTagMap.get(tag)).intValue();
return tag + ": " + i;
}
else
return null;
}
/**
* Return the number of occurences of a tag
* @param tag the HTML tag whose number of occurences is returned
*/
public int numberOfOccurences(String tag)
{
// Format the tag according to what is in the map
tag = tag.trim(); // no leading or trailing whitespaces
tag = tag.toUpperCase();
// Add the delimeters < and > if not present
if (tag.charAt(0)!='<') tag = "<"+tag;
if (tag.charAt(tag.length()-1)!='>') tag = tag+">";
// Find it in the map
if (HTMLTagMap.containsKey(tag))
return ((Integer)HTMLTagMap.get(tag)).intValue();
else
return 0;
}
/**
* To test the class
*/
public static void main(String[] args)
{
uwcse.io.Input input = new uwcse.io.Input();
// Get the url
String urlName = input.readString("URL (e.g. http://www.washington.edu): ");
// Count the tags in this file
HTMLTagCounter c = new HTMLTagCounter(urlName);
System.out.println(c.toString());
// Count tag by tag
String tag;
do
{
tag=input.readString("Tag to count (0 to stop): ");
if (!tag.equals("0"))
System.out.println(tag+": "+c.numberOfOccurences(tag));
}while(!tag.equals("0"));
}
}