/////////////////////////////////////////////////////////////////////// // Proj3.C // // CMSC341 Fall 2001 - Project 3 // // Kyu Bae xxx-xx-5868 Section 201 // // kyu_bae@yahoo.com / kbae1@umbc.edu // // Created: 16 Oct 2001 // // Current: 22 Oct 2001 // /////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////// // Objectives: // This project will implements of a search engine using binary search // tree and priority queues. Given a file containing text, such as a // web page, use binary search tree to store all of the uniques words // contained in the file. In addition to storing unique words, store // an integer with each word indicating the number of times that it // occurred in the document. Once a file has been loaded into a binary // tree, program must be able to answer queries about the file. Finally // given a binary tree containing unique words and their occurrence // counts, must be able to determine efficiently the M most frequent // words in the file from which the tree was contructed.This is done // with a priority queue. /////////////////////////////////////////////////////////////////////// #include #include #include #include #include #include "string.H" #include "dsexceptions.h" #include "BinarySearchTree.H" #include "BinaryHeap.H" #include "DataTree.H" #include "DataHeap.H" ////////////////////////////////////////////////////////////////////// // toLowerCase(): // change string of word to all lower case string of word. // param word: string object will be changed to all lower cases // return : string object which is all lower case letters ////////////////////////////////////////////////////////////////////// string toLowerCase(string word); ////////////////////////////////////////////////////////////////////// // main(): // work in command line. read in two command line arguments and and // print error message if not used correctly. // param argc: integer type argc // param argv: character type array // return: integer type object ////////////////////////////////////////////////////////////////////// int main( int argc, char *argv[]) { //checking for command line arguments if(argc != 2) { cerr << "** Invalid number of command line arguments **\n" << "** Usage is Proj3 **\n" << "** Aborting **" << endl; exit(1); } //opening command file . if not open it will exit ifstream Infile(argv[1]); if (!Infile) { cerr << "** cannot open " << argv[1] << " for the command file **" << endl; cerr << "** Program terminating now **" << endl; exit(1); } string tmpStr1, tmpStr2, tmpStr3, cmdStr; int tmpNum1; char buffer[100]; char *token; ///////////////////////////////////////////////////////////////// //build BSTree & Binary Heap //make Item_not_found and initialize it to "notFd" //if not found then will return this string //////////////////////////////////////////////////////////////// DataTree notFnd(string("notFd"), 0); BinarySearchTree bst( notFnd ); BinaryHeap heap; ////////////////////////////////////////////////////////////// //Read in command file: check for command word ////////////////////////////////////////////////////////////// while ( Infile >> cmdStr ) { cout << "command: " << cmdStr << " "; ////////////////////////////////////////////////////////// //command "LOAD": open a file and read in strings of words ///////////////////////////////////////////////////////// if (strcmp(cmdStr.c_str(), "LOAD") == 0) { Infile >> tmpStr1; cout << tmpStr1 << endl; //open a text file to read in ifstream Infile2(tmpStr1.c_str()); if (!Infile2) { cerr << "** cannot open " << tmpStr1 << " for input **" << endl; cerr << "** Program terminating now **" << endl; exit(1); } //emptying the BSTree bst.makeEmpty(); while (Infile2 >> tmpStr2) { tmpStr3 = toLowerCase(tmpStr2); //changing to lower case //insert data members into BSTree here DataTree data(tmpStr3, 1); DataTree tmpData = bst.find(data); //find first to see it is //there or not if (tmpData.getWord() == "notFd") //if not FD insert bst.insert(data); else { //if FD ++Count and insert DataTree tmpData2 = tmpData; //delete old one and insert bst.remove(tmpData); //new node tmpData2.addCount(); bst.insert(tmpData2); } } Infile2.close(); //close textfile } /////////////////////////////////////////////////// //check command word "FREQUENT" /////////////////////////////////////////////////// else if (strcmp(cmdStr.c_str(), "FREQUENT") == 0) { Infile >> tmpNum1; cout << tmpNum1 << endl << endl; //most frequent words /////////////////////////////////////// //building heap here: ////////////////////////////////////// heap = BinaryHeap (tmpNum1); BinarySearchTreeItr itr(bst.first()); //putting itr at root //inserting # of node asked into heap DataTree tmp6; DataHeap data;//change type of obejct //if inseting into full heap try { for(int i = 0; i < tmpNum1 ; i++) { tmp6 = itr.retrieve(); data = DataHeap (tmp6.getCount(), tmp6.getWord()); heap.insert(data); itr.advance(); } } catch (Overflow ex) { cout << "* ERROR *" << endl; cout << "* Trying to insert into Full heap *" << endl; } //////////////////////////////////////////// //checking if there are larger frequency //if finds larger, delete root and insert it //if not finds don't do anything ///////////////////////////////////////////// DataTree tmp7; DataHeap tmp8; while(!itr.isPastEnd()) { tmp7 = itr.retrieve(); //if it is empty throw try { tmp8 = heap.findMin(); } catch (Underflow ef) { cout << "* ERROR *" << endl; cout << "* Heap is empty!! *" << endl; } if (tmp7.getCount() < tmp8.getCount() || tmp7.getCount() == tmp8.getCount()){ itr.advance(); } else { //deleting from the empty heap try { heap.deleteMin(); } catch (Underflow uf) { cout << "* ERROR *" << endl; cout << "* Heap is empty!! *" << endl; } //making data element for heap DataHeap data2(tmp7.getCount(), tmp7.getWord()); //inserting into full heap try { heap.insert(data2); } catch (Overflow ex) { cout << "* ERROR *" << endl; cout << "* Trying to insert into Full heap *" << endl; } itr.advance(); } } ///////////////////////////////////////////////// //print m most frequent words from Heap //////////////////////////////////////////////// //if heap is empty you cannot do that so throw try { for (int c=0 ; c < tmpNum1; c++) { cout << heap.findMin(); heap.deleteMin(); } } catch (Underflow uf) { cout << "* ERROR *" << endl; cout << "* Trying to delete /or findMin from empty *" << endl; } cout << endl; heap.makeEmpty(); //empptying the heap } //////////////////////////////////////////////////////// //check for command word "QUERY": gives occurrence of //word in the BStree. //////////////////////////////////////////////////////// else if (strcmp(cmdStr.c_str(), "QUERY") == 0) { Infile.getline(buffer, 100, '\n'); int totalCount=0; token = strtok(buffer, " \n"); DataTree tmp1(token, 1); DataTree tmp2 = bst.find(tmp1);//getting query word if (tmp2.getWord() == "notFd") //not found add 0 totalCount = totalCount + 0; else totalCount = totalCount + tmp2.getCount();//found add 1 while(token != NULL){ //getting another query word cout << token << " " ; token = strtok(NULL, " \n"); DataTree tmp3(token, 1); DataTree tmp4 = bst.find(tmp3); if (tmp4.getWord() == "notFd") totalCount = totalCount + 0; else totalCount = totalCount + tmp4.getCount(); } cout << endl << endl; //printing total count cout << "Result = " << totalCount << endl << endl; } ///////////////////////////////////////////////////// //Unknow command word then exit ///////////////////////////////////////////////////// else { cout << endl; //if unknow command word then exit cerr << "** Unknown Command Word **\n" << "** Program terminating. **" << endl; exit (1); } } Infile.close(); //close command file return 0; //finish program } ////////////////////////////////////////////////////////////////////// // toLowerCase(): // change string of word to all lower case string of word. // param word: string object will be changed to all lower cases // return : string object which is all lower case letters ////////////////////////////////////////////////////////////////////// string toLowerCase(string word) { for (int i=0; i < word.length();i++) word[i]=tolower(word[i]); return word; } ////////////////////////////////////////////////////////////////////// // The End of Program!!! //////////////////////////////////////////////////////////////////////