import java.util.*; import java.io.*; import java.net.*; import javax.swing.text.html.*; /* * * @author Kung-En Lin * COMS 572 Project * */ class WebSearch { /* * Data structure that implement the queue that stores the nodes to expand. */ static Vector OPEN; /* * Data structure that stores nodes already visited. */ static Hashtable CLOSED; /* * If true, the output includes a lot more information useful for debugging the code. */ static boolean debugging = false; /* * This variable limits the number of links considered at each page; Beam search * includes in OPEN the best beamWidth links (those with highest heuristic) to OPEN. */ static int beamWidth = 10; /* * Sets the starting page at the current intranet. */ static String rootWebSite = null; /* * Sets the starting page at the current intranet. */ static String[] goalPattern; /* * The goal string */ static String goalString = null; /* * Does this run use thesaurus? */ static boolean isThesaurus = true; /* * # of nodes we visited */ static int nodesVisited = 0; /* * We ignore the words which the length < MAXLENGTHOFCHAR */ final static int MAXLENGTHOFCHAR = 3; /* * # of nodes we created */ static int nodesCreated = 0; /* * Web Search could be called from the command line; * This method checks for correct inputs from the command * line and calls performSearch. */ public static void main(String args[]) { if (args.length < 3) { System.out.println("Usage: java WebSearch QueryString RootWebSite SearchStrategy [NoThesaurus]"); } else { if(args.length == 4) isThesaurus = false; String searchStrategyName = args[2]; // Read the search strategy to use. rootWebSite = args[1]; goalString = args[0]; goalPattern = parsePattern(args[0]); if (searchStrategyName.equalsIgnoreCase("breadth") || searchStrategyName.equalsIgnoreCase("depth") || searchStrategyName.equalsIgnoreCase("best")) { performSearch(rootWebSite, searchStrategyName); } else { System.out.println("The valid search strategies are:"); System.out.println(" BREADTH DEPTH"); } } System.out.println("Created: " + nodesCreated); System.out.println("Nodes : " + nodesVisited); Utilities.waitHere("Press ENTER to exit."); } public static String[] parsePattern(String pattern) { Vector temp = new Vector(); StringTokenizer st = new StringTokenizer(pattern); while(st.hasMoreTokens()) { String tmp = st.nextToken(); if(tmp.length() >= MAXLENGTHOFCHAR) temp.add(tmp); } if(isThesaurus) addThesaurus(temp); String[] res = new String[temp.size()]; int i = 0; for(Iterator iter = temp.iterator(); iter.hasNext();i++) { res[i] = (String) iter.next(); System.out.println(res[i]); } return res; } static void addThesaurus(Vector words) { Vector newWords = new Vector(); try { for(Iterator iter = words.iterator(); iter.hasNext();) { String word = (String) iter.next(); URL url = new URL("http://www.m-w.com/cgi-bin/thesaurus?book=Thesaurus&va=" + word); BufferedReader br = new BufferedReader(new InputStreamReader(url.openStream())); String line; while ((line = br.readLine()) != null) { if(line.indexOf("
Synonyms ") != -1) { int start = line.indexOf("
Synonyms "); int end = line.indexOf("
", start + + "
Synonyms ".length()); String synonyms = line.substring(start + "
Synonyms ".length(), end); StringTokenizer st = new StringTokenizer(synonyms," ,"); while(st.hasMoreTokens()) { String tmp = st.nextToken(); if(tmp.length() >= MAXLENGTHOFCHAR) { System.out.println(tmp); newWords.add(tmp); } } } } br.close(); } } catch (Exception e) { e.printStackTrace(); return; } if(newWords.size() !=0) { words.addAll(newWords); } } /* * This function performs a search from the start node. * If the current page is not a gold page, it calls addNewChildrensToOpen iteratively. * */ static void performSearch(String startNode, String searchStrategy) { /* * Initialiate OPEN queue and CLOSED hash table. */ OPEN = new Vector(); CLOSED = new Hashtable(); OPEN.addElement(new SearchNode(startNode)); // add start node into queue // We will keep searching nodes, while the queue is not empty while (!OPEN.isEmpty()) { SearchNode currentNode = pop(OPEN); // always pick first node in the queue String currentURL = currentNode.url; nodesVisited++; // Go and fetch the contents of this url String contents = null; try { URL url = new URL(currentURL); BufferedReader br = new BufferedReader(new InputStreamReader(url.openStream())); String line; while ((line = br.readLine()) != null) { contents += line + "\n"; } } catch (Exception e) { e.printStackTrace(); continue; } if (isaGoalNode(contents)) { break; } CLOSED.put(currentURL, currentURL); // Remember this node was visited. // calling addNewChildrenToOPEN for adding children into queue addNewChildrenToOPEN(currentNode, contents, searchStrategy); // Provide a status report. if (debugging) System.out.println("Nodes visited = " + nodesVisited + " |OPEN| = " + OPEN.size()); } } // This method reads the page's contents and // collects the 'children' nodes (ie, the hyperlinks on this page). // The parent node is also passed in so that 'backpointers' can be // created (in order to later extract solution paths). static void addNewChildrenToOPEN(SearchNode parent, String contents, String searchStrategy) { BufferedReader br = new BufferedReader(new StringReader(contents)); String line = null; try { while((line = br.readLine()) != null) { String upper = line.toUpperCase(); String hyperlink = null; String hypertext = null; // Skip some words which are not useful if(upper.indexOf("JAVASCRIPT") != -1) continue; if(upper.indexOf("MAILTO") != -1) continue; if(upper.indexOf("PHONEBOOK") != -1) continue; if(upper.indexOf("GRADUATE") != -1) continue; if(upper.indexOf("NEWS") != -1) continue; if(upper.indexOf("FEEDBACK") != -1) continue; int startidx = upper.indexOf("", startidx + 9); if(startidx != -1 && endidx != -1) { String temp = line.substring(startidx + 9, endidx); int endlink = temp.indexOf("\""); if(endlink != -1) { hyperlink = temp.substring(0, endlink); hypertext = temp.substring(temp.indexOf(">") + 1, temp.length() - 1); if(hyperlink.indexOf("http:") == -1) { if(parent.url.charAt(parent.url.length()-1) == '/') hyperlink = parent.url + hyperlink; else hyperlink = parent.url + "/" + hyperlink; } } } if(hyperlink != null) { System.out.println(hyperlink); if (alreadyInOpen(hyperlink)) { // If already in OPEN, we'll ignore this hyperlink // (Be sure to read the "Technical Note" below.) if (debugging) System.out.println(" - this node is in the OPEN list."); } else if (CLOSED.containsKey(hyperlink)) { // If already in CLOSED, we'll also ignore this hyperlink. if (debugging) System.out.println(" - this node is in the CLOSED list."); } else { // Collect the hypertext if this is a previously unvisited node. // (This is only needed for HEURISTIC SEARCH, but collect in // all cases for simplicity.) /* * DFS search: Always add a new child in the beginning of queue. */ if(searchStrategy.equalsIgnoreCase("depth")) { OPEN.insertElementAt(new SearchNode(hyperlink, parent.url, hypertext), 0); } /* * BFS searc: Always add a new child in the end of queue. */ if(searchStrategy.equalsIgnoreCase("breadth")) { OPEN.addElement(new SearchNode(hyperlink, parent.url, hypertext)); } if(searchStrategy.equalsIgnoreCase("best")) { OPEN.addElement(new SearchNode(hyperlink, parent.url, hypertext)); int idx = -1; double max = -1; int i = 0; // Find a max heuristic value in the queue. for(Iterator iter = OPEN.iterator(); iter.hasNext(); i++) { SearchNode sn = (SearchNode) iter.next(); if(sn.getHvalue() > max) { idx = i; max = sn.getHvalue(); } } SearchNode ins = (SearchNode) OPEN.remove(idx); OPEN.insertElementAt(ins, 0); } } } } } catch (Exception e) { e.printStackTrace(); } } // A GOAL is a page that contains the goalPattern set above. static boolean isaGoalNode(String contents) { String upper1 = contents.toUpperCase(); String upper2 = goalString.toUpperCase(); return (contents != null && upper1.indexOf(upper2) >= 0); } // Is this hyperlink already in the OPEN list? // This isn't a very efficient way to do a lookup, // but its fast enough for this homework. // Also, this for-loop structure can be // be adapted for use when inserting nodes into OPEN // according to their heuristic score. static boolean alreadyInOpen(String hyperlink) { int length = OPEN.size(); for(int i = 0; i < length; i++) { SearchNode node = (SearchNode)OPEN.elementAt(i); String oldHyperlink = node.url; if (hyperlink.equalsIgnoreCase(oldHyperlink)) return true; // Found it. } return false; // Not in OPEN. } // You can use this to remove the first element from OPEN. static SearchNode pop(Vector list) { SearchNode result = (SearchNode)list.firstElement(); list.removeElementAt(0); return result; } } class SearchNode { /* * The url of a search node */ public String url; /* * The url of parent's node */ public String parentUrl; /* * Heuristic value */ public double hValue = 0.0; /* * the hypertext of node */ public String hypertext; /* * Constructor for root node */ public SearchNode(String url) { this(url, "", ""); } /* * Constructor for non-root node */ public SearchNode(String url, String parent, String hypertext) { this.url = url; this.parentUrl = parent; this.hypertext = hypertext; WebSearch.nodesCreated++; this.hFunction(); } /* * return heuristic value */ public double getHvalue() { return this.hValue; } /* * Heuristic fucntion */ public void hFunction() { int count = 0; StringTokenizer st = new StringTokenizer(hypertext); while(st.hasMoreTokens()) { String token = st.nextToken(); for(int i=0;i