有次,有人把一个C++面试的问题发给我。我一看,现在面试问题越来越刁难了嘛。题目是:有10万个文本文件,输入一个词组,如何迅速找出含这些词的文件。把题发过来后,开始计时,必须60-90分钟之内完成,将答案寄回。
我一看,这不是要构造一个简单的搜索引擎嘛。
时间紧迫,我立刻打开VI,开始迅速TYPE,并进行了测试。然后将代码与测试的截屏在40多分钟的时候EMAIL了回去。下面将题及我的解答贴出,各位遇到类似的问题可以作为参考。
/////////////////////////////////面试问题//////////////////////////////////////////////
/*************
*
* Copyright (C) D. Yue, March 4, 2009
*
*/
#include
#include
#include
#include
#include
#include
#include
#include
/********************************
Problem 1 Time cut-off: 60 - 90 min.
1.Data: a set of 100,000 ASCII files (strings).
2.Each file contains 1 or more words.
3.A query is entered from stdin (may contain multiple words).
4.Code in C++: find files in (1) that partially matche to query input.
5.Assume: the function will be invoked repeatedly.
6.Optimize for time.
********************************************************************************************/
/**************************
Solution to Problem 1
Algorithm & datastructure
*) Assign a unique integer ID to each of the N ASCII strings, so one can retrieve them by their ID, call these IDs StringIDs;
*) Store the strings in some container, which allows easy retrieval by IDs, a hashtable is approriate.
*) Break each string into words (tokenization)
*) In a hash table with the words as keys, store the StringIDs of the strings that contain a key;
*) On user input search the keys in the hashtable, once a match is found, get the StringIDs, then print the corresponding string
*) take care of the requirement that multiple words must all match
******************************/
using namespace std;
using namespace boost;
struct StringWithID {
StringWithID() {}
StringWithID(int i, const string& s): id(i), value(s) {};
int id;
string value;
static int getNextID() { return NextID++; } // used when assigning a new ID to a new string
private:
static int NextID;
};
int StringWithID::NextID =1; //initialize
class Search{
typedef map IdToString; // given an ID find the string
typedef map StringToId; // given a string, find its ID
typedef map > WordToIds; // given a word, find the IDs of the strings that contains the word
public:
//Initialize the StringSet and WordsToIDs map with the ASCII strings
bool initialize( const vector strs) {
using namespace std;
using namespace boost;
for(vector::const_iterator p=strs.begin(); p != strs.end(); p++ ) {
if(str2id.find(*p) != str2id.end()) continue; // string already there
int new_id = StringWithID::getNextID(); // assign this ID to the new guy
id2str[new_id] = StringWithID(new_id, *p); // stored
str2id[*p] = new_id; //reverse lookup stored
tokenizer tok(*p);
for(tokenizer::iterator i=tok.begin(); i!=tok.end();i++){
cout
w2id[*i].push_back(new_id);
}
}
return true;
}
//pattern is a space separated list of words
//we iterate through the w2id map to check if
bool find_match(const string & pattern) {
//first we tokenize the pattern into words
list pats;
tokenizer tok(pattern);
for(tokenizer::iterator i=tok.begin(); i!=tok.end();i++){
pats.push_back(*i);
}
size_t pat_cnt = pats.size();
map > matched_sets;
for(list::iterator pi = pats.begin(); pi != pats.end(); pi++) {
for(WordToIds::iterator witer = w2id.begin(); witer != w2id.end(); witer++) {//iteratate through the words
string word = witer->first;
if(word.find(*pi) != string::npos) { // the word matched the pattern
cout
copy((witer->second).begin(), (witer->second).end(), inserter(matched_sets[*pi], matched_sets[*pi].begin()));
for(set::iterator i = matched_sets[*pi].begin(); i != matched_sets[*pi].end(); i++) {
int str_id = *i;
string str = id2str[str_id].value;
cout
}
break;
}
}
}
//at this point we have sets of IDs for the individual sub input patterns, we must find the ones that in all of the sets (intersection)
set good_ids;
bool first_run = true;
for(map >::iterator i = matched_sets.begin(); i!= matched_sets.end(); i++) {
if(first_run) {
good_ids = i->second;
first_run = false;
continue;
}
set old_good_ids = good_ids;
set & int_set = i->second;
good_ids.clear();
set_intersection(int_set.begin(), int_set.end(), old_good_ids.begin(), old_good_ids.end(), inserter(good_ids, good_ids.begin()));
}
//now print out the strings
if(good_ids.empty() ) {
cout
}else {
for(set::iterator i = good_ids.begin(); i != good_ids.end(); i++) {
int str_id = *i;
string str = id2str[str_id].value;
cout
}
}
return !good_ids.empty();
}
protected:
IdToString id2str;
StringToId str2id;
WordToIds w2id;
};
//test program
int main() {
vector strings;
strings.push_back("angry brad pitt");
strings.push_back("pitt likes jolie");
Search search;
search.initialize(strings);
char buf[255];
while(std::cin.getline(buf, 255)) {
search.find_match(buf);
}
}