#include <map> #include <vector> #include <list> #include <string> #include <algorithm> #include "tools.h" using namespace std; #ifndef PARTH #define PARTH #define _ARRAY_COUNT_(x) sizeof(x)/sizeof(*x) #define _WPRINTF_(c) setlocale(LC_ALL,"chs"); wprintf( c ); cout << endl; #define _WVECTOR_ vector<wstring> void CreateVectorForArray(const wstring *arr, const int arr_len, vector<wstring> &vec); void StrReplace(wstring &str, const wstring find, const wstring replace); #define _WCreateVectorForArray_(src,target) CreateVectorForArray(src,_ARRAY_COUNT_(src),target); #define _WInVectory(str, vec) WInVectory(str,vec) void CreateVectorForArray(const wstring *arr, const int arr_len, vector<wstring> &vec){ vector<wstring> _tmp( arr, arr+arr_len ); vec.resize(arr_len); copy(_tmp.begin(), _tmp.end(), vec.begin()); } void StrReplace(wstring &str, const wstring find, const wstring replace){ string::size_type pos = 0; while ( (pos = str.find(find, pos)) != wstring::npos ) { str.replace( pos, find.size(), replace ); pos++; } } typedef struct WordPartResultItem{ int count; int startOffset; int endOffset; int sort; float scale; WordPartResultItem():count(0),startOffset(0),endOffset(0),scale(0),sort(0){}; } _WordPartResultItem; typedef WordPartResultItem _WordPartResult; class WordPart{ public: WordPart( wstring word[], int word_len); void SetContent( wstring content); //分词时是否允许将每个单字分割词 bool is_single; //执行分词 map<wstring,_WordPartResult> Part(); private: //用于分词的词典 map<int,list<wstring>> words; //词典总数 int words_length; //将要分词的内容 wstring content; //最大匹配词的长度 int max_length; //忽略的字符 _WVECTOR_ trim_str; //断句符号 _WVECTOR_ end_dot; //获取最大匹配词的长度 int GetMaxLength(wstring *words); void setResult( map<wstring,_WordPartResult> &result, int &sort, const wstring k, const int offset ); }; #endif; WordPart::WordPart( wstring word[], int word_len){ //this->words = word; this->words_length = word_len; this->is_single = false; max_length = GetMaxLength(word); wstring trim_tmp[] = {L"\"",L"'",L"‘",L"’",L"“",L"”",L"\\",L"(",L")",L"(",L")"}; wstring end_tmp[] = {L".",L",",L"。",L",",L":",L":",L" "}; _WCreateVectorForArray_(trim_tmp,trim_str) _WCreateVectorForArray_(end_tmp,end_dot) //整理词典 for( int i=0; i<word_len; i++){ int k = word[i].size(); if( words.find(k) == words.end() ){ list<wstring> v; words.insert( map<int,list<wstring>>::value_type(k,v) ); } words[k].push_back(word[i]); } } void WordPart::SetContent( wstring content){ _WVECTOR_::iterator iter; for( iter=trim_str.begin(); iter!=trim_str.end(); iter++){ StrReplace(content,*iter,L" "); } this->content = content; } int WordPart::GetMaxLength(wstring *words){ int size = 0; for( int i=0; i<words_length; i++ ){ if( words[i].length() > size ) size = words[i].length(); } return size; } map<wstring,_WordPartResult> WordPart::Part(){ map<wstring,_WordPartResult> result; if( content.empty() || words_length == 0 ) return result; //如果要求切分单字 if( is_single ){ wstring cstr; wstring eng; int sort = 0; for( int i=0; i<content.size(); i++ ){ cstr = content.substr(i,1); if( (cstr < L"a" || cstr > L"z") && (cstr < L"A" || cstr > L"Z") && (cstr < L"0" || cstr > L"9") ){ if(!eng.empty()){ if( words.find(eng.size()) == words.end() || find( words[eng.size()].begin(), words[eng.size()].end(), eng ) == words[eng.size()].end() ) setResult(result,sort,eng,i-eng.size()); eng.clear(); } if( words.find(1) == words.end() || find( words[1].begin(), words[1].end(), cstr ) == words[1].end() ) setResult(result,sort,cstr,i); }else{ eng.append(cstr); } } if(!eng.empty()){ if( words.find(eng.size()) == words.end() || find( words[eng.size()].begin(), words[eng.size()].end(), eng ) == words[eng.size()].end() ) setResult(result,sort,eng,content.size()-eng.size()); eng.clear(); } } _WVECTOR_ line; wstring one_line = L""; //段句 for( int i=0; i<content.size(); i++ ){ wstring str = content.substr( i, 1 ); if( find(end_dot.begin(),end_dot.end(),str) == end_dot.end() ){ one_line.append(str); }else{ line.insert(line.begin(),one_line); one_line.clear(); } if( i==content.size() - 1 && !one_line.empty() ){ line.insert(line.begin(),one_line); } } int content_size = content.size(), offset = content_size, sort = 0; for(_WVECTOR_::iterator i=line.begin(); i != line.end(); i++){ offset -= (*(i)).size(); if( i != line.begin() ){ offset -= 1; //断句符号位置 } //分词 wstring str = *i; //段内容 wstring cstr; //分段内容 int begin = 0,pointer; //游标,一个全局游标,和一个相对游标 int k; //词典键 int sublength = max_length; //每个查询字符的长度 if( str.size() > max_length ){ begin = str.size() - max_length; }else{ sublength = str.size(); } while (begin != -sublength) { pointer = 0; while( pointer < sublength ){ if( begin >= 0 ){ cstr = str.substr( begin, sublength-pointer ); k = cstr.size(); if( words.find(k) != words.end() && find(words[k].begin(),words[k].end(),cstr)!=words[k].end() ){ setResult(result,sort,cstr,offset+begin); break; } } pointer++; if( pointer < sublength ) begin++; } begin-=sublength; } } return result; } void WordPart::setResult( map<wstring,_WordPartResult> &result, int &sort, const wstring k, const int offset ){ if( result.find(k) == result.end() ){ _WordPartResultItem row; result.insert( map<wstring,_WordPartResult>::value_type(k,row) ); result[k].startOffset = offset; result[k].sort = sort; sort++; } result[k].count += 1; result[k].endOffset = offset; }