mirror of https://github.com/codelucas/newspaper.git
synced 2025-12-23 05:36:50 +00:00
update jieba to 0.35
This commit is contained in:
parent 94be062981
commit 20ebe1f466
25 changed files with 2 additions and 811227 deletions
@@ -1,378 +0,0 @@
from __future__ import with_statement
__version__ = '0.31'
__license__ = 'MIT'

import re
import os
import sys
import finalseg
import time
import tempfile
import marshal
from math import log
import random
import threading
from functools import wraps

DICTIONARY = "dict.txt"
DICT_LOCK = threading.RLock()
trie = None  # to be initialized
FREQ = {}
min_freq = 0.0
total = 0.0
user_word_tag_tab = {}
initialized = False


def gen_trie(f_name):
    lfreq = {}
    trie = {}
    ltotal = 0.0
    with open(f_name, 'rb') as f:
        lineno = 0
        for line in f.read().rstrip().decode('utf-8').split('\n'):
            lineno += 1
            try:
                word, freq, _ = line.split(' ')
                freq = float(freq)
                lfreq[word] = freq
                ltotal += freq
                p = trie
                for c in word:
                    if c not in p:
                        p[c] = {}
                    p = p[c]
                p[''] = ''  # ending flag
            except ValueError, e:
                print >> sys.stderr, f_name, 'at line', lineno, line
                raise e
    return trie, lfreq, ltotal


def initialize(*args):
    global trie, FREQ, total, min_freq, initialized
    if len(args) == 0:
        dictionary = DICTIONARY
    else:
        dictionary = args[0]
    with DICT_LOCK:
        if initialized:
            return
        if trie:
            del trie
            trie = None
        _curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

        abs_path = os.path.join(_curpath, dictionary)
        print >> sys.stderr, "Building Trie..., from " + abs_path
        t1 = time.time()
        if abs_path == os.path.join(_curpath, "dict.txt"):  # default dictionary
            cache_file = os.path.join(tempfile.gettempdir(), "jieba.cache")
        else:  # custom dictionary
            cache_file = os.path.join(tempfile.gettempdir(), "jieba.user." + str(hash(abs_path)) + ".cache")

        load_from_cache_fail = True
        if os.path.exists(cache_file) and os.path.getmtime(cache_file) > os.path.getmtime(abs_path):
            print >> sys.stderr, "loading model from cache " + cache_file
            try:
                trie, FREQ, total, min_freq = marshal.load(open(cache_file, 'rb'))
                load_from_cache_fail = False
            except:
                load_from_cache_fail = True

        if load_from_cache_fail:
            trie, FREQ, total = gen_trie(abs_path)
            FREQ = dict([(k, log(float(v) / total)) for k, v in FREQ.iteritems()])  # normalize
            min_freq = min(FREQ.itervalues())
            print >> sys.stderr, "dumping model to file cache " + cache_file
            try:
                tmp_suffix = "." + str(random.random())
                with open(cache_file + tmp_suffix, 'wb') as temp_cache_file:
                    marshal.dump((trie, FREQ, total, min_freq), temp_cache_file)
                if os.name == 'nt':
                    import shutil
                    replace_file = shutil.move
                else:
                    replace_file = os.rename
                replace_file(cache_file + tmp_suffix, cache_file)
            except:
                print >> sys.stderr, "dump cache file failed."
                import traceback
                print >> sys.stderr, traceback.format_exc()

        initialized = True

        print >> sys.stderr, "loading model cost ", time.time() - t1, "seconds."
        print >> sys.stderr, "Trie has been built successfully."


def require_initialized(fn):
    global initialized, DICTIONARY

    @wraps(fn)
    def wrapped(*args, **kwargs):
        if initialized:
            return fn(*args, **kwargs)
        else:
            initialize(DICTIONARY)
            return fn(*args, **kwargs)

    return wrapped


def __cut_all(sentence):
    dag = get_DAG(sentence)
    old_j = -1
    for k, L in dag.iteritems():
        if len(L) == 1 and k > old_j:
            yield sentence[k:L[0] + 1]
            old_j = L[0]
        else:
            for j in L:
                if j > k:
                    yield sentence[k:j + 1]
                    old_j = j


def calc(sentence, DAG, idx, route):
    N = len(sentence)
    route[N] = (0.0, '')
    for idx in xrange(N - 1, -1, -1):
        candidates = [(FREQ.get(sentence[idx:x + 1], min_freq) + route[x + 1][0], x) for x in DAG[idx]]
        route[idx] = max(candidates)


@require_initialized
def get_DAG(sentence):
    N = len(sentence)
    i, j = 0, 0
    p = trie
    DAG = {}
    while i < N:
        c = sentence[j]
        if c in p:
            p = p[c]
            if '' in p:
                if i not in DAG:
                    DAG[i] = []
                DAG[i].append(j)
            j += 1
            if j >= N:
                i += 1
                j = i
                p = trie
        else:
            p = trie
            i += 1
            j = i
    for i in xrange(len(sentence)):
        if i not in DAG:
            DAG[i] = [i]
    return DAG


def __cut_DAG(sentence):
    DAG = get_DAG(sentence)
    route = {}
    calc(sentence, DAG, 0, route=route)
    x = 0
    buf = u''
    N = len(sentence)
    while x < N:
        y = route[x][1] + 1
        l_word = sentence[x:y]
        if y - x == 1:
            buf += l_word
        else:
            if len(buf) > 0:
                if len(buf) == 1:
                    yield buf
                    buf = u''
                else:
                    if buf not in FREQ:
                        recognized = finalseg.cut(buf)
                        for t in recognized:
                            yield t
                    else:
                        for elem in buf:
                            yield elem
                    buf = u''
            yield l_word
        x = y

    if len(buf) > 0:
        if len(buf) == 1:
            yield buf
        else:
            if buf not in FREQ:
                recognized = finalseg.cut(buf)
                for t in recognized:
                    yield t
            else:
                for elem in buf:
                    yield elem


def cut(sentence, cut_all=False):
    if not isinstance(sentence, unicode):
        try:
            sentence = sentence.decode('utf-8')
        except UnicodeDecodeError:
            sentence = sentence.decode('gbk', 'ignore')
    re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile(ur"(\r\n|\s)", re.U)
    if cut_all:
        re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)", re.U), re.compile(ur"[^a-zA-Z0-9+#\n]", re.U)
    blocks = re_han.split(sentence)
    cut_block = __cut_DAG
    if cut_all:
        cut_block = __cut_all
    for blk in blocks:
        if re_han.match(blk):
            for word in cut_block(blk):
                yield word
        else:
            tmp = re_skip.split(blk)
            for x in tmp:
                if re_skip.match(x):
                    yield x
                elif not cut_all:
                    for xx in x:
                        yield xx
                else:
                    yield x


def cut_for_search(sentence):
    words = cut(sentence)
    for w in words:
        if len(w) > 2:
            for i in xrange(len(w) - 1):
                gram2 = w[i:i + 2]
                if gram2 in FREQ:
                    yield gram2
        if len(w) > 3:
            for i in xrange(len(w) - 2):
                gram3 = w[i:i + 3]
                if gram3 in FREQ:
                    yield gram3
        yield w


@require_initialized
def load_userdict(f):
    global trie, total, FREQ
    if isinstance(f, (str, unicode)):
        f = open(f, 'rb')
    content = f.read().decode('utf-8')
    line_no = 0
    for line in content.split("\n"):
        line_no += 1
        if line.rstrip() == '':
            continue
        tup = line.split(" ")
        word, freq = tup[0], tup[1]
        if line_no == 1:
            word = word.replace(u'\ufeff', u"")  # remove BOM flag if it exists
        if len(tup) == 3:
            add_word(word, freq, tup[2])
        else:
            add_word(word, freq)


def add_word(word, freq, tag=None):
    global FREQ, trie, total, user_word_tag_tab
    freq = float(freq)
    FREQ[word] = log(freq / total)
    if tag is not None:
        user_word_tag_tab[word] = tag.strip()
    p = trie
    for c in word:
        if c not in p:
            p[c] = {}
        p = p[c]
    p[''] = ''  # ending flag


__ref_cut = cut
__ref_cut_for_search = cut_for_search


def __lcut(sentence):
    return list(__ref_cut(sentence, False))


def __lcut_all(sentence):
    return list(__ref_cut(sentence, True))


def __lcut_for_search(sentence):
    return list(__ref_cut_for_search(sentence))


@require_initialized
def enable_parallel(processnum=None):
    global pool, cut, cut_for_search
    if os.name == 'nt':
        raise Exception("jieba: parallel mode only supports posix system")
    if sys.version_info[0] == 2 and sys.version_info[1] < 6:
        raise Exception("jieba: the parallel feature needs Python version > 2.5")
    from multiprocessing import Pool, cpu_count
    if processnum is None:
        processnum = cpu_count()
    pool = Pool(processnum)

    def pcut(sentence, cut_all=False):
        parts = re.compile('([\r\n]+)').split(sentence)
        if cut_all:
            result = pool.map(__lcut_all, parts)
        else:
            result = pool.map(__lcut, parts)
        for r in result:
            for w in r:
                yield w

    def pcut_for_search(sentence):
        parts = re.compile('([\r\n]+)').split(sentence)
        result = pool.map(__lcut_for_search, parts)
        for r in result:
            for w in r:
                yield w

    cut = pcut
    cut_for_search = pcut_for_search


def disable_parallel():
    global pool, cut, cut_for_search
    if 'pool' in globals():
        pool.close()
        pool = None
    cut = __ref_cut
    cut_for_search = __ref_cut_for_search


def set_dictionary(dictionary_path):
    global initialized, DICTIONARY
    with DICT_LOCK:
        abs_path = os.path.normpath(os.path.join(os.getcwd(), dictionary_path))
        if not os.path.exists(abs_path):
            raise Exception("jieba: path does not exist: " + abs_path)
        DICTIONARY = abs_path
        initialized = False


def get_abs_path_dict():
    _curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
    abs_path = os.path.join(_curpath, DICTIONARY)
    return abs_path


def tokenize(unicode_sentence, mode="default"):
    # mode: "default" or "search"
    if not isinstance(unicode_sentence, unicode):
        raise Exception("jieba: the input parameter should be unicode.")
    start = 0
    if mode == 'default':
        for w in cut(unicode_sentence):
            width = len(w)
            yield (w, start, start + width)
            start += width
    else:
        for w in cut(unicode_sentence):
            width = len(w)
            if len(w) > 2:
                for i in xrange(len(w) - 1):
                    gram2 = w[i:i + 2]
                    if gram2 in FREQ:
                        yield (gram2, start + i, start + i + 2)
            if len(w) > 3:
                for i in xrange(len(w) - 2):
                    gram3 = w[i:i + 3]
                    if gram3 in FREQ:
                        yield (gram3, start + i, start + i + 3)
            yield (w, start, start + width)
            start += width
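For orientation, a minimal usage sketch of the module deleted above, assuming the vendored jieba 0.31 API (Python 2, to match the code); cut() is lazy and builds the trie on first use:

# -*- coding: utf-8 -*-
import jieba

# accurate mode: the most probable segmentation via the DAG + calc() above
print "/".join(jieba.cut(u"我来到北京清华大学", cut_all=False))
# full mode: every word the trie can find
print "/".join(jieba.cut(u"我来到北京清华大学", cut_all=True))
# search mode: extra 2- and 3-grams for building a search index
print "/".join(jieba.cut_for_search(u"小明硕士毕业于中国科学院计算所"))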
@@ -1,38 +0,0 @@
import jieba
import os
try:
    from analyzer import ChineseAnalyzer
except ImportError:
    pass

_curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
f_name = os.path.join(_curpath, "idf.txt")
content = open(f_name, 'rb').read().decode('utf-8')

idf_freq = {}
lines = content.split('\n')
for line in lines:
    word, freq = line.split(' ')
    idf_freq[word] = float(freq)

median_idf = sorted(idf_freq.values())[len(idf_freq) / 2]
stop_words = set([
    "the", "of", "is", "and", "to", "in", "that", "we", "for", "an", "are",
    "by", "be", "as", "on", "with", "can", "if", "from", "which", "you",
    "it", "this", "then", "at", "have", "all", "not", "one", "has", "or"
])


def extract_tags(sentence, topK=20):
    words = jieba.cut(sentence)
    freq = {}
    for w in words:
        if len(w.strip()) < 2:
            continue
        if w.lower() in stop_words:
            continue
        freq[w] = freq.get(w, 0.0) + 1.0
    total = sum(freq.values())
    freq = [(k, v / total) for k, v in freq.iteritems()]

    tf_idf_list = [(v * idf_freq.get(k, median_idf), k) for k, v in freq]
    st_list = sorted(tf_idf_list, reverse=True)

    top_tuples = st_list[:topK]
    tags = [a[1] for a in top_tuples]
    return tags
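A sketch of how this TF-IDF keyword extractor was invoked (it shipped as jieba.analyse in 0.31; the sample text is illustrative):

# -*- coding: utf-8 -*-
import jieba.analyse

text = u"自然语言处理是人工智能的一个重要方向"
for tag in jieba.analyse.extract_tags(text, topK=5):
    print tag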
@@ -1,33 +0,0 @@
#encoding=utf-8
from whoosh.analysis import RegexAnalyzer, LowercaseFilter, StopFilter
from whoosh.analysis import Tokenizer, Token

import jieba
import re

STOP_WORDS = frozenset(('a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'can',
                        'for', 'from', 'have', 'if', 'in', 'is', 'it', 'may',
                        'not', 'of', 'on', 'or', 'tbd', 'that', 'the', 'this',
                        'to', 'us', 'we', 'when', 'will', 'with', 'yet',
                        'you', 'your', u'的', u'了', u'和'))

accepted_chars = re.compile(ur"[\u4E00-\u9FA5]+")


class ChineseTokenizer(Tokenizer):
    def __call__(self, text, **kargs):
        words = jieba.tokenize(text, mode="search")
        token = Token()
        for (w, start_pos, stop_pos) in words:
            # keep multi-character non-Chinese tokens; skip single
            # non-Chinese characters
            if not accepted_chars.match(w) and len(w) <= 1:
                continue
            token.original = token.text = w
            token.pos = start_pos
            token.startchar = start_pos
            token.endchar = stop_pos
            yield token


def ChineseAnalyzer(stoplist=STOP_WORDS, minsize=1):
    return ChineseTokenizer() | LowercaseFilter() | StopFilter(stoplist=stoplist, minsize=minsize)
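A sketch of wiring this analyzer into a Whoosh schema (assuming whoosh is installed; a Whoosh analyzer instance is callable and yields Token objects):

# -*- coding: utf-8 -*-
from whoosh.fields import Schema, TEXT
from jieba.analyse import ChineseAnalyzer  # import path as vendored in 0.31

analyzer = ChineseAnalyzer()
for token in analyzer(u"我的好朋友是李明"):
    print token.text

schema = Schema(content=TEXT(stored=True, analyzer=analyzer))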
File diff suppressed because it is too large
File diff suppressed because it is too large
Binary file not shown.
@@ -1,105 +0,0 @@
from __future__ import with_statement
import re
import os
import marshal
import sys

MIN_FLOAT = -3.14e100

PROB_START_P = "prob_start.p"
PROB_TRANS_P = "prob_trans.p"
PROB_EMIT_P = "prob_emit.p"


PrevStatus = {
    'B': ('E', 'S'),
    'M': ('M', 'B'),
    'S': ('S', 'E'),
    'E': ('B', 'M')
}


def load_model():
    _curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

    start_p = {}
    abs_path = os.path.join(_curpath, PROB_START_P)
    with open(abs_path, 'rb') as f:
        start_p = marshal.load(f)

    trans_p = {}
    abs_path = os.path.join(_curpath, PROB_TRANS_P)
    with open(abs_path, 'rb') as f:
        trans_p = marshal.load(f)

    emit_p = {}
    abs_path = os.path.join(_curpath, PROB_EMIT_P)
    with open(abs_path, 'rb') as f:
        emit_p = marshal.load(f)

    return start_p, trans_p, emit_p


if sys.platform.startswith("java"):
    start_P, trans_P, emit_P = load_model()
else:
    import prob_start, prob_trans, prob_emit
    start_P, trans_P, emit_P = prob_start.P, prob_trans.P, prob_emit.P


def viterbi(obs, states, start_p, trans_p, emit_p):
    V = [{}]  # tabular
    path = {}
    for y in states:  # init
        V[0][y] = start_p[y] + emit_p[y].get(obs[0], MIN_FLOAT)
        path[y] = [y]
    for t in range(1, len(obs)):
        V.append({})
        newpath = {}
        for y in states:
            em_p = emit_p[y].get(obs[t], MIN_FLOAT)
            (prob, state) = max([(V[t - 1][y0] + trans_p[y0].get(y, MIN_FLOAT) + em_p, y0) for y0 in PrevStatus[y]])
            V[t][y] = prob
            newpath[y] = path[state] + [y]
        path = newpath

    (prob, state) = max([(V[len(obs) - 1][y], y) for y in ('E', 'S')])

    return (prob, path[state])


def __cut(sentence):
    global emit_P
    prob, pos_list = viterbi(sentence, ('B', 'M', 'E', 'S'), start_P, trans_P, emit_P)
    begin, next = 0, 0
    for i, char in enumerate(sentence):
        pos = pos_list[i]
        if pos == 'B':
            begin = i
        elif pos == 'E':
            yield sentence[begin:i + 1]
            next = i + 1
        elif pos == 'S':
            yield char
            next = i + 1
    if next < len(sentence):
        yield sentence[next:]


def cut(sentence):
    if not isinstance(sentence, unicode):
        try:
            sentence = sentence.decode('utf-8')
        except UnicodeDecodeError:
            sentence = sentence.decode('gbk', 'ignore')
    re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"(\d+\.\d+|[a-zA-Z0-9]+)")
    blocks = re_han.split(sentence)
    for blk in blocks:
        if re_han.match(blk):
            for word in __cut(blk):
                yield word
        else:
            tmp = re_skip.split(blk)
            for x in tmp:
                if x != "":
                    yield x
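This HMM segmenter is the fallback the core module uses for character runs missing from the dictionary; it also works standalone (a sketch, Python 2):

# -*- coding: utf-8 -*-
import jieba.finalseg

# viterbi() tags each character B/M/E/S; __cut() groups the tags into words
print "/".join(jieba.finalseg.cut(u"小明硕士毕业于中国科学院计算所"))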
Binary file not shown.
File diff suppressed because it is too large
Binary file not shown.
@@ -1,4 +0,0 @@
P = {'B': -0.26268660809250016,
     'E': -3.14e+100,
     'M': -3.14e+100,
     'S': -1.4652633398537678}
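These are natural-log probabilities of the state of a word's first character (B = begin of a multi-character word, S = single-character word; a word cannot start in M or E, so those get -3.14e+100 as a stand-in for log 0). The two live entries should sum to one in probability space:

# sanity check, assuming the values are natural logs:
from math import exp
print exp(-0.26268660809250016) + exp(-1.4652633398537678)  # ~= 1.0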
Binary file not shown.
@@ -1,4 +0,0 @@
P = {'B': {'E': -0.510825623765990, 'M': -0.916290731874155},
     'E': {'B': -0.5897149736854513, 'S': -0.8085250474669937},
     'M': {'E': -0.33344856811948514, 'M': -1.2603623820268226},
     'S': {'B': -0.7211965654669841, 'S': -0.6658631448798212}}
@@ -1,202 +0,0 @@
from __future__ import with_statement
import re
import os
import viterbi
import jieba
import sys
import marshal

default_encoding = sys.getfilesystemencoding()

PROB_START_P = "prob_start.p"
PROB_TRANS_P = "prob_trans.p"
PROB_EMIT_P = "prob_emit.p"
CHAR_STATE_TAB_P = "char_state_tab.p"


def load_model(f_name, isJython=True):
    _curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

    result = {}
    with open(f_name, "rb") as f:
        for line in f:
            line = line.strip()
            if line == "":
                continue
            word, _, tag = line.split(' ')
            result[word.decode('utf-8')] = tag
    if not isJython:
        return result

    start_p = {}
    abs_path = os.path.join(_curpath, PROB_START_P)
    with open(abs_path, 'rb') as f:
        start_p = marshal.load(f)

    trans_p = {}
    abs_path = os.path.join(_curpath, PROB_TRANS_P)
    with open(abs_path, 'rb') as f:
        trans_p = marshal.load(f)

    emit_p = {}
    abs_path = os.path.join(_curpath, PROB_EMIT_P)
    with open(abs_path, 'rb') as f:
        emit_p = marshal.load(f)

    state = {}
    abs_path = os.path.join(_curpath, CHAR_STATE_TAB_P)
    with open(abs_path, 'rb') as f:
        state = marshal.load(f)

    return state, start_p, trans_p, emit_p, result


if sys.platform.startswith("java"):
    char_state_tab_P, start_P, trans_P, emit_P, word_tag_tab = load_model(jieba.get_abs_path_dict())
else:
    import char_state_tab, prob_start, prob_trans, prob_emit
    char_state_tab_P, start_P, trans_P, emit_P = char_state_tab.P, prob_start.P, prob_trans.P, prob_emit.P
    word_tag_tab = load_model(jieba.get_abs_path_dict(), isJython=False)

if jieba.user_word_tag_tab:
    word_tag_tab.update(jieba.user_word_tag_tab)


class pair(object):
    def __init__(self, word, flag):
        self.word = word
        self.flag = flag

    def __unicode__(self):
        return self.word + u"/" + self.flag

    def __repr__(self):
        return self.__str__()

    def __str__(self):
        return self.__unicode__().encode(default_encoding)

    def encode(self, arg):
        return self.__unicode__().encode(arg)


def __cut(sentence):
    prob, pos_list = viterbi.viterbi(sentence, char_state_tab_P, start_P, trans_P, emit_P)
    begin, next = 0, 0

    for i, char in enumerate(sentence):
        pos = pos_list[i][0]
        if pos == 'B':
            begin = i
        elif pos == 'E':
            yield pair(sentence[begin:i + 1], pos_list[i][1])
            next = i + 1
        elif pos == 'S':
            yield pair(char, pos_list[i][1])
            next = i + 1
    if next < len(sentence):
        yield pair(sentence[next:], pos_list[next][1])


def __cut_detail(sentence):
    re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"([\.0-9]+|[a-zA-Z0-9]+)")
    re_eng, re_num = re.compile(ur"[a-zA-Z0-9]+"), re.compile(ur"[\.0-9]+")
    blocks = re_han.split(sentence)
    for blk in blocks:
        if re_han.match(blk):
            for word in __cut(blk):
                yield word
        else:
            tmp = re_skip.split(blk)
            for x in tmp:
                if x != "":
                    if re_num.match(x):
                        yield pair(x, 'm')
                    elif re_eng.match(x):
                        yield pair(x, 'eng')
                    else:
                        yield pair(x, 'x')


def __cut_DAG(sentence):
    DAG = jieba.get_DAG(sentence)
    route = {}

    jieba.calc(sentence, DAG, 0, route=route)

    x = 0
    buf = u''
    N = len(sentence)
    while x < N:
        y = route[x][1] + 1
        l_word = sentence[x:y]
        if y - x == 1:
            buf += l_word
        else:
            if len(buf) > 0:
                if len(buf) == 1:
                    yield pair(buf, word_tag_tab.get(buf, 'x'))
                    buf = u''
                else:
                    if buf not in jieba.FREQ:
                        recognized = __cut_detail(buf)
                        for t in recognized:
                            yield t
                    else:
                        for elem in buf:
                            yield pair(elem, word_tag_tab.get(elem, 'x'))
                    buf = u''
            yield pair(l_word, word_tag_tab.get(l_word, 'x'))
        x = y

    if len(buf) > 0:
        if len(buf) == 1:
            yield pair(buf, word_tag_tab.get(buf, 'x'))
        else:
            if buf not in jieba.FREQ:
                recognized = __cut_detail(buf)
                for t in recognized:
                    yield t
            else:
                for elem in buf:
                    yield pair(elem, word_tag_tab.get(elem, 'x'))


def __cut_internal(sentence):
    if not isinstance(sentence, unicode):
        try:
            sentence = sentence.decode('utf-8')
        except UnicodeDecodeError:
            sentence = sentence.decode('gbk', 'ignore')
    re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile(ur"(\r\n|\s)")
    re_eng, re_num = re.compile(ur"[a-zA-Z0-9]+"), re.compile(ur"[\.0-9]+")
    blocks = re_han.split(sentence)
    for blk in blocks:
        if re_han.match(blk):
            for word in __cut_DAG(blk):
                yield word
        else:
            tmp = re_skip.split(blk)
            for x in tmp:
                if re_skip.match(x):
                    yield pair(x, 'x')
                else:
                    for xx in x:
                        if re_num.match(xx):
                            yield pair(xx, 'm')
                        elif re_eng.match(xx):  # fixed: previously tested re_eng.match(x)
                            yield pair(xx, 'eng')
                        else:
                            yield pair(xx, 'x')


def __lcut_internal(sentence):
    return list(__cut_internal(sentence))


def cut(sentence):
    if (not hasattr(jieba, 'pool')) or (jieba.pool is None):
        for w in __cut_internal(sentence):
            yield w
    else:
        parts = re.compile('([\r\n]+)').split(sentence)
        result = jieba.pool.map(__lcut_internal, parts)
        for r in result:
            for w in r:
                yield w
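The tagger was exposed as jieba.posseg.cut and yields the pair objects defined above (a sketch, Python 2):

# -*- coding: utf-8 -*-
import jieba.posseg

for w in jieba.posseg.cut(u"我爱北京天安门"):
    print w.word, w.flag   # e.g. 天安门 ns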
Binary file not shown.
File diff suppressed because it is too large
Binary file not shown.
File diff suppressed because it is too large
Binary file not shown.
@@ -1,256 +0,0 @@
P = {('B', 'a'): -4.762305214596967,
     ('B', 'ad'): -6.680066036784177,
     ('B', 'ag'): -3.14e+100,
     ('B', 'an'): -8.697083223018778,
     ('B', 'b'): -5.018374362109218,
     ('B', 'bg'): -3.14e+100,
     ('B', 'c'): -3.423880184954888,
     ('B', 'd'): -3.9750475297585357,
     ('B', 'df'): -8.888974230828882,
     ('B', 'dg'): -3.14e+100,
     ('B', 'e'): -8.563551830394255,
     ('B', 'en'): -3.14e+100,
     ('B', 'f'): -5.491630418482717,
     ('B', 'g'): -3.14e+100,
     ('B', 'h'): -13.533365129970255,
     ('B', 'i'): -6.1157847275557105,
     ('B', 'in'): -3.14e+100,
     ('B', 'j'): -5.0576191284681915,
     ('B', 'jn'): -3.14e+100,
     ('B', 'k'): -3.14e+100,
     ('B', 'l'): -4.905883584659895,
     ('B', 'ln'): -3.14e+100,
     ('B', 'm'): -3.6524299819046386,
     ('B', 'mg'): -3.14e+100,
     ('B', 'mq'): -6.78695300139688,
     ('B', 'n'): -1.6966257797548328,
     ('B', 'ng'): -3.14e+100,
     ('B', 'nr'): -2.2310495913769506,
     ('B', 'nrfg'): -5.873722175405573,
     ('B', 'nrt'): -4.985642733519195,
     ('B', 'ns'): -2.8228438314969213,
     ('B', 'nt'): -4.846091668182416,
     ('B', 'nz'): -3.94698846057672,
     ('B', 'o'): -8.433498702146057,
     ('B', 'p'): -4.200984132085048,
     ('B', 'q'): -6.998123858956596,
     ('B', 'qe'): -3.14e+100,
     ('B', 'qg'): -3.14e+100,
     ('B', 'r'): -3.4098187790818413,
     ('B', 'rg'): -3.14e+100,
     ('B', 'rr'): -12.434752841302146,
     ('B', 'rz'): -7.946116471570005,
     ('B', 's'): -5.522673590839954,
     ('B', 't'): -3.3647479094528574,
     ('B', 'tg'): -3.14e+100,
     ('B', 'u'): -9.163917277503234,
     ('B', 'ud'): -3.14e+100,
     ('B', 'ug'): -3.14e+100,
     ('B', 'uj'): -3.14e+100,
     ('B', 'ul'): -3.14e+100,
     ('B', 'uv'): -3.14e+100,
     ('B', 'uz'): -3.14e+100,
     ('B', 'v'): -2.6740584874265685,
     ('B', 'vd'): -9.044728760238115,
     ('B', 'vg'): -3.14e+100,
     ('B', 'vi'): -12.434752841302146,
     ('B', 'vn'): -4.3315610890163585,
     ('B', 'vq'): -12.147070768850364,
     ('B', 'w'): -3.14e+100,
     ('B', 'x'): -3.14e+100,
     ('B', 'y'): -9.844485675856319,
     ('B', 'yg'): -3.14e+100,
     ('B', 'z'): -7.045681111485645,
     ('B', 'zg'): -3.14e+100,
     ('E', 'a'): -3.14e+100,
     ('E', 'ad'): -3.14e+100,
     ('E', 'ag'): -3.14e+100,
     ('E', 'an'): -3.14e+100,
     ('E', 'b'): -3.14e+100,
     ('E', 'bg'): -3.14e+100,
     ('E', 'c'): -3.14e+100,
     ('E', 'd'): -3.14e+100,
     ('E', 'df'): -3.14e+100,
     ('E', 'dg'): -3.14e+100,
     ('E', 'e'): -3.14e+100,
     ('E', 'en'): -3.14e+100,
     ('E', 'f'): -3.14e+100,
     ('E', 'g'): -3.14e+100,
     ('E', 'h'): -3.14e+100,
     ('E', 'i'): -3.14e+100,
     ('E', 'in'): -3.14e+100,
     ('E', 'j'): -3.14e+100,
     ('E', 'jn'): -3.14e+100,
     ('E', 'k'): -3.14e+100,
     ('E', 'l'): -3.14e+100,
     ('E', 'ln'): -3.14e+100,
     ('E', 'm'): -3.14e+100,
     ('E', 'mg'): -3.14e+100,
     ('E', 'mq'): -3.14e+100,
     ('E', 'n'): -3.14e+100,
     ('E', 'ng'): -3.14e+100,
     ('E', 'nr'): -3.14e+100,
     ('E', 'nrfg'): -3.14e+100,
     ('E', 'nrt'): -3.14e+100,
     ('E', 'ns'): -3.14e+100,
     ('E', 'nt'): -3.14e+100,
     ('E', 'nz'): -3.14e+100,
     ('E', 'o'): -3.14e+100,
     ('E', 'p'): -3.14e+100,
     ('E', 'q'): -3.14e+100,
     ('E', 'qe'): -3.14e+100,
     ('E', 'qg'): -3.14e+100,
     ('E', 'r'): -3.14e+100,
     ('E', 'rg'): -3.14e+100,
     ('E', 'rr'): -3.14e+100,
     ('E', 'rz'): -3.14e+100,
     ('E', 's'): -3.14e+100,
     ('E', 't'): -3.14e+100,
     ('E', 'tg'): -3.14e+100,
     ('E', 'u'): -3.14e+100,
     ('E', 'ud'): -3.14e+100,
     ('E', 'ug'): -3.14e+100,
     ('E', 'uj'): -3.14e+100,
     ('E', 'ul'): -3.14e+100,
     ('E', 'uv'): -3.14e+100,
     ('E', 'uz'): -3.14e+100,
     ('E', 'v'): -3.14e+100,
     ('E', 'vd'): -3.14e+100,
     ('E', 'vg'): -3.14e+100,
     ('E', 'vi'): -3.14e+100,
     ('E', 'vn'): -3.14e+100,
     ('E', 'vq'): -3.14e+100,
     ('E', 'w'): -3.14e+100,
     ('E', 'x'): -3.14e+100,
     ('E', 'y'): -3.14e+100,
     ('E', 'yg'): -3.14e+100,
     ('E', 'z'): -3.14e+100,
     ('E', 'zg'): -3.14e+100,
     ('M', 'a'): -3.14e+100,
     ('M', 'ad'): -3.14e+100,
     ('M', 'ag'): -3.14e+100,
     ('M', 'an'): -3.14e+100,
     ('M', 'b'): -3.14e+100,
     ('M', 'bg'): -3.14e+100,
     ('M', 'c'): -3.14e+100,
     ('M', 'd'): -3.14e+100,
     ('M', 'df'): -3.14e+100,
     ('M', 'dg'): -3.14e+100,
     ('M', 'e'): -3.14e+100,
     ('M', 'en'): -3.14e+100,
     ('M', 'f'): -3.14e+100,
     ('M', 'g'): -3.14e+100,
     ('M', 'h'): -3.14e+100,
     ('M', 'i'): -3.14e+100,
     ('M', 'in'): -3.14e+100,
     ('M', 'j'): -3.14e+100,
     ('M', 'jn'): -3.14e+100,
     ('M', 'k'): -3.14e+100,
     ('M', 'l'): -3.14e+100,
     ('M', 'ln'): -3.14e+100,
     ('M', 'm'): -3.14e+100,
     ('M', 'mg'): -3.14e+100,
     ('M', 'mq'): -3.14e+100,
     ('M', 'n'): -3.14e+100,
     ('M', 'ng'): -3.14e+100,
     ('M', 'nr'): -3.14e+100,
     ('M', 'nrfg'): -3.14e+100,
     ('M', 'nrt'): -3.14e+100,
     ('M', 'ns'): -3.14e+100,
     ('M', 'nt'): -3.14e+100,
     ('M', 'nz'): -3.14e+100,
     ('M', 'o'): -3.14e+100,
     ('M', 'p'): -3.14e+100,
     ('M', 'q'): -3.14e+100,
     ('M', 'qe'): -3.14e+100,
     ('M', 'qg'): -3.14e+100,
     ('M', 'r'): -3.14e+100,
     ('M', 'rg'): -3.14e+100,
     ('M', 'rr'): -3.14e+100,
     ('M', 'rz'): -3.14e+100,
     ('M', 's'): -3.14e+100,
     ('M', 't'): -3.14e+100,
     ('M', 'tg'): -3.14e+100,
     ('M', 'u'): -3.14e+100,
     ('M', 'ud'): -3.14e+100,
     ('M', 'ug'): -3.14e+100,
     ('M', 'uj'): -3.14e+100,
     ('M', 'ul'): -3.14e+100,
     ('M', 'uv'): -3.14e+100,
     ('M', 'uz'): -3.14e+100,
     ('M', 'v'): -3.14e+100,
     ('M', 'vd'): -3.14e+100,
     ('M', 'vg'): -3.14e+100,
     ('M', 'vi'): -3.14e+100,
     ('M', 'vn'): -3.14e+100,
     ('M', 'vq'): -3.14e+100,
     ('M', 'w'): -3.14e+100,
     ('M', 'x'): -3.14e+100,
     ('M', 'y'): -3.14e+100,
     ('M', 'yg'): -3.14e+100,
     ('M', 'z'): -3.14e+100,
     ('M', 'zg'): -3.14e+100,
     ('S', 'a'): -3.9025396831295227,
     ('S', 'ad'): -11.048458480182255,
     ('S', 'ag'): -6.954113917960154,
     ('S', 'an'): -12.84021794941031,
     ('S', 'b'): -6.472888763970454,
     ('S', 'bg'): -3.14e+100,
     ('S', 'c'): -4.786966795861212,
     ('S', 'd'): -3.903919764181873,
     ('S', 'df'): -3.14e+100,
     ('S', 'dg'): -8.948397651299683,
     ('S', 'e'): -5.942513006281674,
     ('S', 'en'): -3.14e+100,
     ('S', 'f'): -5.194820249981676,
     ('S', 'g'): -6.507826815331734,
     ('S', 'h'): -8.650563207383884,
     ('S', 'i'): -3.14e+100,
     ('S', 'in'): -3.14e+100,
     ('S', 'j'): -4.911992119644354,
     ('S', 'jn'): -3.14e+100,
     ('S', 'k'): -6.940320595827818,
     ('S', 'l'): -3.14e+100,
     ('S', 'ln'): -3.14e+100,
     ('S', 'm'): -3.269200652116097,
     ('S', 'mg'): -10.825314928868044,
     ('S', 'mq'): -3.14e+100,
     ('S', 'n'): -3.8551483897645107,
     ('S', 'ng'): -4.913434861102905,
     ('S', 'nr'): -4.483663103956885,
     ('S', 'nrfg'): -3.14e+100,
     ('S', 'nrt'): -3.14e+100,
     ('S', 'ns'): -3.14e+100,
     ('S', 'nt'): -12.147070768850364,
     ('S', 'nz'): -3.14e+100,
     ('S', 'o'): -8.464460927750023,
     ('S', 'p'): -2.9868401813596317,
     ('S', 'q'): -4.888658618255058,
     ('S', 'qe'): -3.14e+100,
     ('S', 'qg'): -3.14e+100,
     ('S', 'r'): -2.7635336784127853,
     ('S', 'rg'): -10.275268591948773,
     ('S', 'rr'): -3.14e+100,
     ('S', 'rz'): -3.14e+100,
     ('S', 's'): -3.14e+100,
     ('S', 't'): -3.14e+100,
     ('S', 'tg'): -6.272842531880403,
     ('S', 'u'): -6.940320595827818,
     ('S', 'ud'): -7.728230161053767,
     ('S', 'ug'): -7.5394037026636855,
     ('S', 'uj'): -6.85251045118004,
     ('S', 'ul'): -8.4153713175535,
     ('S', 'uv'): -8.15808672228609,
     ('S', 'uz'): -9.299258625372996,
     ('S', 'v'): -3.053292303412302,
     ('S', 'vd'): -3.14e+100,
     ('S', 'vg'): -5.9430181843676895,
     ('S', 'vi'): -3.14e+100,
     ('S', 'vn'): -11.453923588290419,
     ('S', 'vq'): -3.14e+100,
     ('S', 'w'): -3.14e+100,
     ('S', 'x'): -8.427419656069674,
     ('S', 'y'): -6.1970794699489575,
     ('S', 'yg'): -13.533365129970255,
     ('S', 'z'): -3.14e+100,
     ('S', 'zg'): -3.14e+100}
Binary file not shown.
File diff suppressed because it is too large
@@ -1,43 +0,0 @@
import operator
MIN_FLOAT = -3.14e100


def get_top_states(t_state_v, K=4):
    items = t_state_v.items()
    topK = sorted(items, key=operator.itemgetter(1), reverse=True)[:K]
    return [x[0] for x in topK]


def viterbi(obs, states, start_p, trans_p, emit_p):
    V = [{}]  # tabular
    mem_path = [{}]
    all_states = trans_p.keys()
    for y in states.get(obs[0], all_states):  # init
        V[0][y] = start_p[y] + emit_p[y].get(obs[0], MIN_FLOAT)
        mem_path[0][y] = ''
    for t in range(1, len(obs)):
        V.append({})
        mem_path.append({})
        prev_states = get_top_states(V[t - 1])
        # NOTE: the top-K beam above is immediately overwritten; in effect
        # every reachable previous state is kept:
        prev_states = [x for x in mem_path[t - 1].keys() if len(trans_p[x]) > 0]

        prev_states_expect_next = set((y for x in prev_states for y in trans_p[x].keys()))
        obs_states = states.get(obs[t], all_states)
        obs_states = set(obs_states) & prev_states_expect_next

        if len(obs_states) == 0:
            obs_states = all_states
        for y in obs_states:
            (prob, state) = max([(V[t - 1][y0] + trans_p[y0].get(y, MIN_FLOAT) + emit_p[y].get(obs[t], MIN_FLOAT), y0) for y0 in prev_states])
            V[t][y] = prob
            mem_path[t][y] = state

    last = [(V[-1][y], y) for y in mem_path[-1].keys()]
    (prob, state) = max(last)

    route = [None] * len(obs)
    i = len(obs) - 1
    while i >= 0:
        route[i] = state
        state = mem_path[i][state]
        i -= 1
    return (prob, route)
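For reference, a toy invocation of this pruned Viterbi with a made-up two-state model; the tables below are illustrative, not jieba's real ones (states maps each observation to its allowed states):

# -*- coding: utf-8 -*-
import viterbi  # the module above, importable inside the vendored package

start_p = {'B': -0.7, 'S': -0.7}
trans_p = {'B': {'S': -0.5}, 'S': {'B': -0.5, 'S': -1.0}}
emit_p = {'B': {u'中': -1.0}, 'S': {u'国': -1.0}}
states = {u'中': ('B',), u'国': ('S',)}

prob, route = viterbi.viterbi(u"中国", states, start_p, trans_p, emit_p)
print prob, route  # -3.2 ['B', 'S']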
@@ -114,7 +114,7 @@ class StopWordsChinese(StopWords):
     def candidate_words(self, stripped_input):
         # jieba builds a tree that takes a while. avoid building
         # this tree if we don't use the chinese language
-        from .packages import jieba
+        import jieba
         return jieba.cut(stripped_input, cut_all=True)
@@ -6,4 +6,5 @@ lxml==3.3.5
 nltk==2.0.4
 requests==2.3.0
 six==1.7.3
+jieba==0.35
 -e git+https://github.com/karls/responses@regex-url-matching#egg=responses