import java.util.*;
import java.io.*;
import java.net.*;
import javax.swing.text.html.*;
/*
*
* @author Kung-En Lin
* COMS 572 Project
*
*/
class WebSearch {
/*
* Queue of SearchNode objects still waiting to be expanded.
* performSearch always takes the first element.
*/
static Vector OPEN;
/*
* URLs of nodes already visited. performSearch stores each URL
* as both key and value, so only key lookups are meaningful.
*/
static Hashtable CLOSED;
/*
* If true, the output includes a lot more information useful for debugging the code
* (status reports and notes about links that were already known).
*/
static boolean debugging = false;
/*
* Intended limit on the number of links considered at each page: a beam search
* would add to OPEN only the best beamWidth links (those with the highest heuristic).
* NOTE(review): nothing in the WebSearch class reads this value.
*/
static int beamWidth = 10;
/*
* URL of the page where the search starts; main sets it from the
* second command-line argument.
*/
static String rootWebSite = null;
/*
* Keywords derived from the query string by parsePattern: the query words of at
* least MAXLENGTHOFCHAR characters, plus their synonyms when isThesaurus is true.
* NOTE(review): not read inside WebSearch; presumably used by the heuristic
* in SearchNode - confirm.
*/
static String[] goalPattern;
/*
* The raw query string (first command-line argument). isaGoalNode searches
* page contents for it, ignoring case.
*/
static String goalString = null;
/*
* Does this run use the thesaurus? main switches it off when exactly
* four command-line arguments are given.
*/
static boolean isThesaurus = true;
/*
* Number of nodes visited, i.e. taken from OPEN by performSearch.
*/
static int nodesVisited = 0;
/*
* Minimum word length: words shorter than this are ignored by parsePattern
* and addThesaurus. Despite the MAX prefix, this is a lower bound.
*/
final static int MAXLENGTHOFCHAR = 3;
/*
* Number of nodes created; incremented by the SearchNode constructor.
*/
static int nodesCreated = 0;
/*
* Command-line entry point.
*
* Expected arguments: QueryString RootWebSite SearchStrategy [NoThesaurus].
* With fewer than three arguments only the usage line is printed. Supplying
* exactly four arguments disables the thesaurus expansion. The strategy must
* be "breadth", "depth" or "best" (case-insensitive); any other value prints
* the list of valid strategies instead of searching.
*
* In every case the node counters are printed afterwards and the program
* waits for ENTER before exiting.
*/
public static void main(String args[]) {
    if (args.length < 3) {
        System.out.println("Usage: java WebSearch QueryString RootWebSite SearchStrategy [NoThesaurus]");
    } else {
        // The thesaurus flag must be settled before parsePattern runs,
        // because parsePattern consults it.
        if (args.length == 4) {
            isThesaurus = false;
        }
        String searchStrategyName = args[2]; // Read the search strategy to use.
        rootWebSite = args[1];
        goalString = args[0];
        goalPattern = parsePattern(args[0]);
        if (searchStrategyName.equalsIgnoreCase("breadth") ||
                searchStrategyName.equalsIgnoreCase("depth") ||
                searchStrategyName.equalsIgnoreCase("best")) {
            performSearch(rootWebSite, searchStrategyName);
        } else {
            // List every strategy accepted above, including BEST.
            System.out.println("The valid search strategies are:");
            System.out.println(" BREADTH DEPTH BEST");
        }
    }
    System.out.println("Created: " + nodesCreated);
    System.out.println("Nodes : " + nodesVisited);
    Utilities.waitHere("Press ENTER to exit.");
}
/*
* Splits the query on whitespace and keeps the words that have at least
* MAXLENGTHOFCHAR characters. When isThesaurus is set, addThesaurus may
* append synonyms to that list. Every resulting keyword is printed on its
* own line and the keywords are returned as an array, in list order.
*/
public static String[] parsePattern(String pattern) {
    Vector keywords = new Vector();
    for (StringTokenizer words = new StringTokenizer(pattern); words.hasMoreTokens();) {
        String word = words.nextToken();
        if (word.length() < MAXLENGTHOFCHAR) {
            continue; // too short to be a useful keyword
        }
        keywords.add(word);
    }
    if (isThesaurus) {
        addThesaurus(keywords);
    }
    String[] result = new String[keywords.size()];
    keywords.copyInto(result);
    for (int k = 0; k < result.length; k++) {
        System.out.println(result[k]);
    }
    return result;
}
/*
* Looks up synonyms for every word in the given vector and appends them to it.
*
* For each word a thesaurus page is requested from www.m-w.com and read line
* by line. On lines that contain the "Synonyms" marker, the text between that
* marker and the following end marker is split on spaces and commas; every
* piece of at least MAXLENGTHOFCHAR characters is printed and collected.
*
* If any exception occurs, the stack trace is printed and the method returns
* without adding anything to words - synonyms collected so far are discarded.
*
* NOTE(review): the marker string literals inside the read loop are split
* across source lines (unterminated string literals), so this method does not
* compile as written. It looks as if markup inside the literals was lost -
* TODO restore the original literals.
* NOTE(review): the word is appended to the request URL without URL-encoding.
*/
static void addThesaurus(Vector words) {
// Collected separately and appended at the end: words is being iterated
// below and must not be modified during that iteration.
Vector newWords = new Vector();
try {
for(Iterator iter = words.iterator(); iter.hasNext();) {
String word = (String) iter.next();
URL url = new URL("http://www.m-w.com/cgi-bin/thesaurus?book=Thesaurus&va=" + word);
BufferedReader br = new BufferedReader(new InputStreamReader(url.openStream()));
String line;
while ((line = br.readLine()) != null) {
if(line.indexOf("
Synonyms ") != -1) {
int start = line.indexOf("
Synonyms ");
int end = line.indexOf("
", start + + "
Synonyms ".length());
String synonyms = line.substring(start + "
Synonyms ".length(), end);
// Synonyms are separated by spaces and/or commas.
StringTokenizer st = new StringTokenizer(synonyms," ,");
while(st.hasMoreTokens()) {
String tmp = st.nextToken();
if(tmp.length() >= MAXLENGTHOFCHAR) {
System.out.println(tmp);
newWords.add(tmp);
}
}
}
}
// Only reached when the page was read without an exception.
br.close();
}
} catch (Exception e) {
e.printStackTrace();
// Give up entirely: words is left unchanged.
return;
}
if(newWords.size() !=0) {
words.addAll(newWords);
}
}
/*
* Performs a search starting from startNode.
*
* OPEN and CLOSED are re-created, the start node is queued, and then the
* first node of OPEN is repeatedly removed and its page downloaded. The loop
* ends when a goal page is found (see isaGoalNode) or OPEN becomes empty.
* For every non-goal page addNewChildrenToOPEN queues the page's links
* according to searchStrategy.
*
* A page that cannot be downloaded is reported with a stack trace and
* skipped; it still counts as visited.
*/
static void performSearch(String startNode, String searchStrategy) {
    OPEN = new Vector();
    CLOSED = new Hashtable();
    OPEN.addElement(new SearchNode(startNode)); // add start node into queue
    // Keep searching while there are nodes left to expand.
    while (!OPEN.isEmpty()) {
        SearchNode currentNode = pop(OPEN); // always pick first node in the queue
        String currentURL = currentNode.url;
        nodesVisited++;
        String contents;
        try {
            // Remember the visit before fetching, so that a page that cannot
            // be downloaded is not queued again when another page links to it.
            CLOSED.put(currentURL, currentURL);
            contents = fetchPage(currentURL);
        } catch (Exception e) {
            e.printStackTrace();
            continue;
        }
        if (isaGoalNode(contents)) {
            break;
        }
        addNewChildrenToOPEN(currentNode, contents, searchStrategy);
        // Provide a status report.
        if (debugging) {
            System.out.println("Nodes visited = " + nodesVisited
                    + " |OPEN| = " + OPEN.size());
        }
    }
}

/*
* Downloads the page at the given address and returns its text, with every
* line terminated by "\n". The stream is closed even when reading fails.
*/
private static String fetchPage(String address) throws IOException {
    // Start from an empty buffer; concatenating onto a null String would
    // prefix the page text with the word "null".
    StringBuffer page = new StringBuffer();
    BufferedReader br = new BufferedReader(
            new InputStreamReader(new URL(address).openStream()));
    try {
        String line;
        while ((line = br.readLine()) != null) {
            page.append(line).append("\n");
        }
    } finally {
        br.close();
    }
    return page.toString();
}
// Reads the page's contents line by line and collects the 'children' nodes
// (i.e., the hyperlinks on this page). At most one link per line is
// recognised: the first <A HREF="..."> ... </A> pair, matched ignoring case.
// Lines mentioning certain words (see isIgnoredLine) are skipped entirely.
//
// The parent node is passed in so that 'backpointers' can be stored in the
// children. A link that contains neither "http:" nor "https:" is treated as
// relative and appended to the parent's URL. Every link found is printed.
// Links already in OPEN or CLOSED are ignored; new ones are queued according
// to searchStrategy:
//   "depth"   - at the front of OPEN,
//   "breadth" - at the end of OPEN,
//   "best"    - in order of descending heuristic value.
// Any exception is reported with a stack trace and ends the processing of
// this page.
static void addNewChildrenToOPEN(SearchNode parent,
        String contents, String searchStrategy) {
    final String anchorOpen = "<A HREF=\"";
    final String anchorClose = "</A>";
    BufferedReader br = new BufferedReader(new StringReader(contents));
    try {
        String line;
        while ((line = br.readLine()) != null) {
            // Markers are searched in an upper-cased copy so that matching
            // ignores case; the link itself is cut from the original line.
            String upper = line.toUpperCase();
            if (isIgnoredLine(upper)) {
                continue;
            }
            int startidx = upper.indexOf(anchorOpen);
            if (startidx == -1) {
                continue;
            }
            int linkStart = startidx + anchorOpen.length();
            int endidx = upper.indexOf(anchorClose, linkStart);
            if (endidx == -1) {
                continue;
            }
            // temp looks like: url" ...>link text
            String temp = line.substring(linkStart, endidx);
            int endlink = temp.indexOf('"');
            if (endlink == -1) {
                continue;
            }
            String hyperlink = temp.substring(0, endlink);
            // The link text is everything after the '>' that closes the
            // opening tag, up to the end of temp (an empty text is allowed).
            int tagEnd = temp.indexOf('>', endlink);
            String hypertext = (tagEnd == -1) ? "" : temp.substring(tagEnd + 1);
            if (hyperlink.indexOf("http:") == -1 && hyperlink.indexOf("https:") == -1) {
                // Relative link: append it to the parent's URL.
                if (parent.url.endsWith("/")) {
                    hyperlink = parent.url + hyperlink;
                } else {
                    hyperlink = parent.url + "/" + hyperlink;
                }
            }
            System.out.println(hyperlink);
            if (alreadyInOpen(hyperlink)) {
                // Already waiting to be expanded: ignore this hyperlink.
                if (debugging) System.out.println(" - this node is in the OPEN list.");
            } else if (CLOSED.containsKey(hyperlink)) {
                // Already visited: ignore this hyperlink as well.
                if (debugging) System.out.println(" - this node is in the CLOSED list.");
            } else if (searchStrategy.equalsIgnoreCase("depth")) {
                // DFS: a new child always goes to the beginning of the queue.
                OPEN.insertElementAt(new SearchNode(hyperlink, parent.url, hypertext), 0);
            } else if (searchStrategy.equalsIgnoreCase("breadth")) {
                // BFS: a new child always goes to the end of the queue.
                OPEN.addElement(new SearchNode(hyperlink, parent.url, hypertext));
            } else if (searchStrategy.equalsIgnoreCase("best")) {
                insertByHeuristic(new SearchNode(hyperlink, parent.url, hypertext));
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}

// Returns true when the (upper-cased) line mentions one of the words that
// mark links not worth following.
private static boolean isIgnoredLine(String upperLine) {
    String[] ignoredWords = {
        "JAVASCRIPT", "MAILTO", "PHONEBOOK", "GRADUATE", "NEWS", "FEEDBACK"
    };
    for (int i = 0; i < ignoredWords.length; i++) {
        if (upperLine.indexOf(ignoredWords[i]) != -1) {
            return true;
        }
    }
    return false;
}

// Inserts child into OPEN so that OPEN stays ordered by descending heuristic
// value; among equal values the earlier-inserted node stays in front. This
// keeps the first element the best node even after earlier ones were popped.
// Assumes OPEN is already ordered, which holds when every child was added
// through this method (the start node is alone in OPEN initially).
private static void insertByHeuristic(SearchNode child) {
    int position = 0;
    while (position < OPEN.size()
            && ((SearchNode) OPEN.elementAt(position)).getHvalue() >= child.getHvalue()) {
        position++;
    }
    OPEN.insertElementAt(child, position);
}
// A GOAL is a page whose contents contain goalString, ignoring case.
// Returns false when either the contents or goalString is null.
static boolean isaGoalNode(String contents) {
    // Check for null before dereferencing; testing afterwards would be too late.
    if (contents == null || goalString == null) {
        return false;
    }
    return contents.toUpperCase().indexOf(goalString.toUpperCase()) >= 0;
}
// Reports whether some node in OPEN has the given hyperlink as its url.
// URLs are compared ignoring case. This is a plain linear scan of OPEN.
static boolean alreadyInOpen(String hyperlink) {
    for (Iterator queued = OPEN.iterator(); queued.hasNext();) {
        SearchNode candidate = (SearchNode) queued.next();
        if (hyperlink.equalsIgnoreCase(candidate.url)) {
            return true; // Found it.
        }
    }
    return false; // Not in OPEN.
}
// Removes the first element of the list and returns it.
// Throws NoSuchElementException when the list is empty.
static SearchNode pop(Vector list) {
    if (list.isEmpty()) {
        throw new NoSuchElementException();
    }
    SearchNode head = (SearchNode) list.elementAt(0);
    list.remove(0);
    return head;
}
}
class SearchNode
{
/*
* The url of a search node
*/
public String url;
/*
* The url of parent's node
*/
public String parentUrl;
/*
* Heuristic value
*/
public double hValue = 0.0;
/*
* the hypertext of node
*/
public String hypertext;
/*
* Constructor for root node
*/
public SearchNode(String url) {
this(url, "", "");
}
/*
* Constructor for non-root node
*/
public SearchNode(String url, String parent, String hypertext) {
this.url = url;
this.parentUrl = parent;
this.hypertext = hypertext;
WebSearch.nodesCreated++;
this.hFunction();
}
/*
* return heuristic value
*/
public double getHvalue() {
return this.hValue;
}
/*
* Heuristic fucntion
*/
public void hFunction() {
int count = 0;
StringTokenizer st = new StringTokenizer(hypertext);
while(st.hasMoreTokens()) {
String token = st.nextToken();
for(int i=0;i