// RMM (Reverse Maximum Matching) word segmentation implemented in C++

#include <map>
#include <vector>
#include <list>
#include <string>
#include <algorithm>
#include "tools.h"
using namespace std;


#ifndef PARTH
#define PARTH

// Number of elements in a true array (not a pointer). The expansion is fully
// parenthesised so it stays a single operand inside larger expressions such
// as `n / _ARRAY_COUNT_(a)`.
// NOTE(review): identifiers that start with an underscore followed by an
// uppercase letter are reserved for the implementation; the names are kept
// only because existing call sites use them.
#define _ARRAY_COUNT_(x) (sizeof(x)/sizeof(*(x)))
// Sets the locale, prints the wide C string `c` and ends the line. Expands to
// several statements, so never use it as the unbraced body of an if/else/loop.
// NOTE(review): `c` is passed to wprintf as the format string, so a '%' in the
// text is interpreted as a conversion; "chs" looks like a platform-specific
// locale name - confirm before relying on it elsewhere.
#define _WPRINTF_(c) setlocale(LC_ALL,"chs"); wprintf( c ); cout << endl;
// Shorthand for a vector of wide strings.
#define _WVECTOR_ vector<wstring>

// Replaces the contents of `vec` with the `arr_len` strings starting at `arr`.
void CreateVectorForArray(const wstring *arr, const int arr_len, vector<wstring> &vec);
// Replaces occurrences of `find` inside `str` with `replace`, in place.
void StrReplace(wstring &str, const wstring find, const wstring replace);

// Copies the fixed-size array `src` into the vector `target`. The expansion
// already ends with ';', so call sites may omit their own.
#define _WCreateVectorForArray_(src,target) CreateVectorForArray((src),_ARRAY_COUNT_(src),(target));
// Thin alias for WInVectory, which is not defined in this file
// (presumably declared in tools.h - TODO confirm).
#define _WInVectory(str, vec) WInVectory(str,vec)

// Replaces the contents of `vec` with copies of the `arr_len` strings that
// start at `arr`.
//   arr     - first element of the source array
//   arr_len - number of elements to copy
//   vec     - destination; its previous contents are discarded
// A null `arr` or a non-positive `arr_len` leaves `vec` empty.
void CreateVectorForArray(const wstring *arr, const int arr_len, vector<wstring> &vec){
	if( arr == nullptr || arr_len <= 0 ){
		vec.clear();
		return;
	}
	// assign() copies straight from the array; no intermediate vector needed.
	vec.assign( arr, arr + arr_len );
}


// Replaces every occurrence of `find` in `str` with `replace`, in place.
//   str     - string to modify
//   find    - text to search for; an empty value leaves `str` untouched
//   replace - text substituted for each match (may be empty)
// The string is scanned once from left to right and inserted replacement
// text is never rescanned, so the call terminates even when `replace`
// contains `find`.
void StrReplace(wstring &str, const wstring find, const wstring replace){
	// An empty pattern matches at every position and would never terminate.
	if( find.empty() )
		return;

	wstring::size_type pos = 0;
	while ( (pos = str.find(find, pos)) != wstring::npos ) {
		str.replace( pos, find.size(), replace );
		// Resume right after the inserted text: advancing by one would skip an
		// adjacent match when `replace` is empty and would re-match inside the
		// replacement when it is longer than one character.
		pos += replace.size();
	}
}

 
// Statistics kept for one word found during segmentation. Every field
// starts at zero.
struct WordPartResultItem{
	int count = 0;			// incremented each time the word is recorded
	int startOffset = 0;	// offset passed when the word was first recorded
	int endOffset = 0;		// offset passed the most recent time it was recorded
	int sort = 0;			// rank assigned when the word was first recorded
	float scale = 0;		// zero-initialised; not written by the code in this file
	WordPartResultItem() = default;
};
typedef WordPartResultItem _WordPartResultItem;



typedef WordPartResultItem _WordPartResult;



// Dictionary-based word segmenter. Construct it with a word list, hand it
// text through SetContent(), then call Part() to obtain per-word statistics.
class WordPart{
	
public:
	
	// Builds the segmenter from `word_len` dictionary entries starting at `word`.
	WordPart( wstring word[], int word_len);
	
	// Stores the text to segment, after replacing the ignored characters
	// (trim_str) with spaces.
	void SetContent( wstring content);

	// Whether segmentation may also report each single character as a word.
	// The constructor sets it to false.
	bool is_single;

	// Runs the segmentation and returns the words found, keyed by word text.
	map<wstring,_WordPartResult> Part(); 

private:

	// Dictionary used for segmentation, grouped by word length (key = length).
	map<int,list<wstring>> words;
	
	// Total number of dictionary entries passed to the constructor.
	int words_length;

	// Content to be segmented.
	wstring content;

	// Length of the longest dictionary word.
	int max_length;

	// Characters to ignore.
	_WVECTOR_ trim_str;
	
	// Sentence-break characters.
	_WVECTOR_ end_dot;

	// Returns the length of the longest word among the first words_length
	// entries of `words`.
	int GetMaxLength(wstring *words);

	// Records one occurrence of word `k` at `offset` in `result`; `sort` is the
	// running rank counter and is advanced when `k` is seen for the first time.
	void setResult( map<wstring,_WordPartResult> &result, int &sort, const wstring k, const int offset );

};

#endif // PARTH


// Builds a segmenter from a dictionary array.
//   word     - array of dictionary words
//   word_len - number of entries in `word`
// A null `word` or a non-positive `word_len` is treated as an empty
// dictionary, for which Part() returns no results.
WordPart::WordPart( wstring word[], int word_len){
	const bool usable = ( word != nullptr && word_len > 0 );
	this->words_length = usable ? word_len : 0;
	this->is_single = false;
	// GetMaxLength walks words_length entries, so it must run after the
	// assignment above.
	max_length = GetMaxLength(word);

	// Characters replaced by a space in SetContent(), and the characters that
	// end a sentence in Part().
	// NOTE(review): "(", ")", "," and ":" each appear twice below; the second
	// copies may originally have been full-width forms lost in an encoding
	// conversion - confirm against the original source.
	wstring trim_tmp[] = {L"\"",L"'",L"‘",L"’",L"“",L"”",L"\\",L"(",L")",L"(",L")"};
	wstring end_tmp[] = {L".",L",",L"。",L",",L":",L":",L" "};
	_WCreateVectorForArray_(trim_tmp,trim_str)
	_WCreateVectorForArray_(end_tmp,end_dot)

	// Group the dictionary by word length; operator[] creates a bucket the
	// first time a length is seen.
	for( int i=0; i<words_length; i++){
		const int k = static_cast<int>( word[i].size() );
		words[k].push_back(word[i]);
	}

}

void WordPart::SetContent( wstring content){
	_WVECTOR_::iterator iter;
	for( iter=trim_str.begin(); iter!=trim_str.end(); iter++){
		StrReplace(content,*iter,L" ");
	}
	this->content = content;
}

// Returns the length of the longest string among the first words_length
// entries of `words`, or 0 when there are none.
int WordPart::GetMaxLength(wstring *words){

	int longest = 0;
	int idx = 0;
	while( idx < words_length ){
		const int current = static_cast<int>( words[idx].length() );
		if( current > longest ){
			longest = current;
		}
		++idx;
	}

	return longest;

}


// File-local helper: true when `w` is present in the length-keyed dictionary.
static bool WordPartDictionaryHas( const map<int,list<wstring>> &dict, const wstring &w ){
	map<int,list<wstring>>::const_iterator bucket = dict.find( static_cast<int>(w.size()) );
	return bucket != dict.end()
		&& find( bucket->second.begin(), bucket->second.end(), w ) != bucket->second.end();
}

// Segments the stored content and returns the words found, keyed by word text.
//
// Optional first pass (is_single): the content is read left to right. Runs of
// ASCII letters/digits and every other single character (delimiters
// included) are recorded when they are NOT dictionary words.
//
// Dictionary pass: the content is split at the sentence-break characters in
// end_dot. Sentences are processed from the last to the first, and each one is
// consumed from its right end: the longest dictionary word that ends at the
// current right edge (at most max_length characters) is recorded and the edge
// moves to its start; if nothing matches, the edge moves left by one character.
//
// Offsets passed to setResult are 0-based indexes into the stored content.
// Returns an empty map when there is no content or no usable dictionary.
map<wstring,_WordPartResult> WordPart::Part(){

	map<wstring,_WordPartResult> result;

	// max_length <= 0 means no dictionary word can ever match; the matching
	// loop below needs a window of at least one character to make progress.
	if( content.empty() || words_length <= 0 || max_length <= 0 )
		return result;

	const int content_size = static_cast<int>( content.size() );

	// One rank counter shared by both passes so that every recorded word gets
	// a distinct `sort` value.
	int sort = 0;

	// Single-character pass.
	if( is_single ){
		wstring eng;	// pending run of ASCII letters/digits
		for( int i=0; i<content_size; i++ ){
			const wchar_t ch = content[i];
			const bool is_ascii_alnum = (ch >= L'a' && ch <= L'z')
				|| (ch >= L'A' && ch <= L'Z')
				|| (ch >= L'0' && ch <= L'9');
			if( is_ascii_alnum ){
				eng.push_back(ch);
				continue;
			}
			// A non-alphanumeric character closes the pending run.
			if( !eng.empty() ){
				if( !WordPartDictionaryHas(words, eng) )
					setResult(result,sort,eng,i-static_cast<int>(eng.size()));
				eng.clear();
			}
			const wstring single(1, ch);
			if( !WordPartDictionaryHas(words, single) )
				setResult(result,sort,single,i);
		}
		// Run that extends to the end of the content.
		if( !eng.empty() && !WordPartDictionaryHas(words, eng) )
			setResult(result,sort,eng,content_size-static_cast<int>(eng.size()));
	}

	// Split into sentences, remembering (start offset, length) of each. Keeping
	// the real start offset makes the reported positions independent of how
	// many delimiters surround a sentence, including a trailing one.
	vector< pair<int,int> > sentences;
	int sentence_start = 0;
	for( int i=0; i<content_size; i++ ){
		const wstring ch = content.substr( i, 1 );
		if( find(end_dot.begin(),end_dot.end(),ch) != end_dot.end() ){
			if( i > sentence_start )
				sentences.push_back( make_pair(sentence_start, i-sentence_start) );
			sentence_start = i + 1;
		}
	}
	if( content_size > sentence_start )
		sentences.push_back( make_pair(sentence_start, content_size-sentence_start) );

	// Dictionary pass: last sentence first, each consumed right to left.
	for( vector< pair<int,int> >::reverse_iterator s = sentences.rbegin(); s != sentences.rend(); ++s ){
		const int base = s->first;	// offset of the sentence inside content
		int remaining = s->second;	// right edge of the part not yet segmented

		while( remaining > 0 ){
			int matched = 0;
			// Try the longest candidate ending at the right edge first.
			for( int len = (max_length < remaining) ? max_length : remaining; len >= 1; --len ){
				const int start = base + remaining - len;
				const wstring candidate = content.substr( start, len );
				if( WordPartDictionaryHas(words, candidate) ){
					setResult(result,sort,candidate,start);
					matched = len;
					break;
				}
			}
			// Skip the matched word, or a single character when nothing matched.
			remaining -= (matched > 0) ? matched : 1;
		}
	}
	return result;
}


void WordPart::setResult( map<wstring,_WordPartResult> &result, int &sort, const wstring k, const int offset ){
	if( result.find(k) == result.end() ){
		_WordPartResultItem row;
		result.insert( map<wstring,_WordPartResult>::value_type(k,row) );
		result[k].startOffset = offset;
		result[k].sort = sort;
		sort++;
	}
	result[k].count += 1;
	result[k].endOffset = offset;
} 

// 发表评论 ("Post a comment": leftover web-page text, not part of the program)