mirror of https://github.com/codelucas/newspaper.git
synced 2025-12-23 05:36:50 +00:00
update jieba to 0.35
This commit is contained in:
parent 94be062981
commit 20ebe1f466
25 changed files with 2 additions and 811227 deletions
@@ -1,378 +0,0 @@
from __future__ import with_statement
__version__ = '0.31'
__license__ = 'MIT'

import re
import os
import sys
import finalseg
import time
import tempfile
import marshal
from math import log
import random
import threading
from functools import wraps

DICTIONARY = "dict.txt"
DICT_LOCK = threading.RLock()
trie = None  # to be initialized
FREQ = {}
min_freq = 0.0
total = 0.0
user_word_tag_tab = {}
initialized = False


def gen_trie(f_name):
    lfreq = {}
    trie = {}
    ltotal = 0.0
    with open(f_name, 'rb') as f:
        lineno = 0
        for line in f.read().rstrip().decode('utf-8').split('\n'):
            lineno += 1
            try:
                word, freq, _ = line.split(' ')
                freq = float(freq)
                lfreq[word] = freq
                ltotal += freq
                p = trie
                for c in word:
                    if c not in p:
                        p[c] = {}
                    p = p[c]
                p[''] = ''  # ending flag
            except ValueError, e:
                print >> sys.stderr, f_name, 'at line', lineno, line
                raise e
    return trie, lfreq, ltotal


def initialize(*args):
    global trie, FREQ, total, min_freq, initialized
    if len(args) == 0:
        dictionary = DICTIONARY
    else:
        dictionary = args[0]
    with DICT_LOCK:
        if initialized:
            return
        if trie:
            del trie
            trie = None
        _curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

        abs_path = os.path.join(_curpath, dictionary)
        print >> sys.stderr, "Building Trie..., from " + abs_path
        t1 = time.time()
        if abs_path == os.path.join(_curpath, "dict.txt"):  # default dictionary
            cache_file = os.path.join(tempfile.gettempdir(), "jieba.cache")
        else:  # custom dictionary
            cache_file = os.path.join(tempfile.gettempdir(), "jieba.user." + str(hash(abs_path)) + ".cache")

        load_from_cache_fail = True
        if os.path.exists(cache_file) and os.path.getmtime(cache_file) > os.path.getmtime(abs_path):
            print >> sys.stderr, "loading model from cache " + cache_file
            try:
                trie, FREQ, total, min_freq = marshal.load(open(cache_file, 'rb'))
                load_from_cache_fail = False
            except:
                load_from_cache_fail = True

        if load_from_cache_fail:
            trie, FREQ, total = gen_trie(abs_path)
            FREQ = dict([(k, log(float(v) / total)) for k, v in FREQ.iteritems()])  # normalize
            min_freq = min(FREQ.itervalues())
            print >> sys.stderr, "dumping model to file cache " + cache_file
            try:
                tmp_suffix = "." + str(random.random())
                with open(cache_file + tmp_suffix, 'wb') as temp_cache_file:
                    marshal.dump((trie, FREQ, total, min_freq), temp_cache_file)
                if os.name == 'nt':
                    import shutil
                    replace_file = shutil.move
                else:
                    replace_file = os.rename
                replace_file(cache_file + tmp_suffix, cache_file)
            except:
                print >> sys.stderr, "dump cache file failed."
                import traceback
                print >> sys.stderr, traceback.format_exc()

        initialized = True

        print >> sys.stderr, "loading model cost ", time.time() - t1, "seconds."
        print >> sys.stderr, "Trie has been built successfully."


def require_initialized(fn):
    global initialized, DICTIONARY

    @wraps(fn)
    def wrapped(*args, **kwargs):
        if initialized:
            return fn(*args, **kwargs)
        else:
            initialize(DICTIONARY)
            return fn(*args, **kwargs)

    return wrapped


def __cut_all(sentence):
    dag = get_DAG(sentence)
    old_j = -1
    for k, L in dag.iteritems():
        if len(L) == 1 and k > old_j:
            yield sentence[k:L[0] + 1]
            old_j = L[0]
        else:
            for j in L:
                if j > k:
                    yield sentence[k:j + 1]
                    old_j = j


def calc(sentence, DAG, idx, route):
    N = len(sentence)
    route[N] = (0.0, '')
    for idx in xrange(N - 1, -1, -1):
        candidates = [(FREQ.get(sentence[idx:x + 1], min_freq) + route[x + 1][0], x) for x in DAG[idx]]
        route[idx] = max(candidates)


@require_initialized
def get_DAG(sentence):
    N = len(sentence)
    i, j = 0, 0
    p = trie
    DAG = {}
    while i < N:
        c = sentence[j]
        if c in p:
            p = p[c]
            if '' in p:
                if i not in DAG:
                    DAG[i] = []
                DAG[i].append(j)
            j += 1
            if j >= N:
                i += 1
                j = i
                p = trie
        else:
            p = trie
            i += 1
            j = i
    for i in xrange(len(sentence)):
        if i not in DAG:
            DAG[i] = [i]
    return DAG


def __cut_DAG(sentence):
    DAG = get_DAG(sentence)
    route = {}
    calc(sentence, DAG, 0, route=route)
    x = 0
    buf = u''
    N = len(sentence)
    while x < N:
        y = route[x][1] + 1
        l_word = sentence[x:y]
        if y - x == 1:
            buf += l_word
        else:
            if len(buf) > 0:
                if len(buf) == 1:
                    yield buf
                    buf = u''
                else:
                    if buf not in FREQ:
                        recognized = finalseg.cut(buf)
                        for t in recognized:
                            yield t
                    else:
                        for elem in buf:
                            yield elem
                    buf = u''
            yield l_word
        x = y

    if len(buf) > 0:
        if len(buf) == 1:
            yield buf
        else:
            if buf not in FREQ:
                recognized = finalseg.cut(buf)
                for t in recognized:
                    yield t
            else:
                for elem in buf:
                    yield elem


def cut(sentence, cut_all=False):
    if not isinstance(sentence, unicode):
        try:
            sentence = sentence.decode('utf-8')
        except UnicodeDecodeError:
            sentence = sentence.decode('gbk', 'ignore')
    re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile(ur"(\r\n|\s)", re.U)
    if cut_all:
        re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)", re.U), re.compile(ur"[^a-zA-Z0-9+#\n]", re.U)
    blocks = re_han.split(sentence)
    cut_block = __cut_DAG
    if cut_all:
        cut_block = __cut_all
    for blk in blocks:
        if re_han.match(blk):
            for word in cut_block(blk):
                yield word
        else:
            tmp = re_skip.split(blk)
            for x in tmp:
                if re_skip.match(x):
                    yield x
                elif not cut_all:
                    for xx in x:
                        yield xx
                else:
                    yield x


def cut_for_search(sentence):
    words = cut(sentence)
    for w in words:
        if len(w) > 2:
            for i in xrange(len(w) - 1):
                gram2 = w[i:i + 2]
                if gram2 in FREQ:
                    yield gram2
        if len(w) > 3:
            for i in xrange(len(w) - 2):
                gram3 = w[i:i + 3]
                if gram3 in FREQ:
                    yield gram3
        yield w


@require_initialized
def load_userdict(f):
    global trie, total, FREQ
    if isinstance(f, (str, unicode)):
        f = open(f, 'rb')
    content = f.read().decode('utf-8')
    line_no = 0
    for line in content.split("\n"):
        line_no += 1
        if line.rstrip() == '':
            continue
        tup = line.split(" ")
        word, freq = tup[0], tup[1]
        if line_no == 1:
            word = word.replace(u'\ufeff', u"")  # remove BOM flag if it exists
        if len(tup) == 3:
            add_word(word, freq, tup[2])
        else:
            add_word(word, freq)


def add_word(word, freq, tag=None):
    global FREQ, trie, total, user_word_tag_tab
    freq = float(freq)
    FREQ[word] = log(freq / total)
    if tag is not None:
        user_word_tag_tab[word] = tag.strip()
    p = trie
    for c in word:
        if c not in p:
            p[c] = {}
        p = p[c]
    p[''] = ''  # ending flag


__ref_cut = cut
__ref_cut_for_search = cut_for_search


def __lcut(sentence):
    return list(__ref_cut(sentence, False))


def __lcut_all(sentence):
    return list(__ref_cut(sentence, True))


def __lcut_for_search(sentence):
    return list(__ref_cut_for_search(sentence))


@require_initialized
def enable_parallel(processnum=None):
    global pool, cut, cut_for_search
    if os.name == 'nt':
        raise Exception("jieba: parallel mode only supports posix system")
    if sys.version_info[0] == 2 and sys.version_info[1] < 6:
        raise Exception("jieba: the parallel feature needs Python version > 2.5")
    from multiprocessing import Pool, cpu_count
    if processnum is None:
        processnum = cpu_count()
    pool = Pool(processnum)

    def pcut(sentence, cut_all=False):
        parts = re.compile('([\r\n]+)').split(sentence)
        if cut_all:
            result = pool.map(__lcut_all, parts)
        else:
            result = pool.map(__lcut, parts)
        for r in result:
            for w in r:
                yield w

    def pcut_for_search(sentence):
        parts = re.compile('([\r\n]+)').split(sentence)
        result = pool.map(__lcut_for_search, parts)
        for r in result:
            for w in r:
                yield w

    cut = pcut
    cut_for_search = pcut_for_search


def disable_parallel():
    global pool, cut, cut_for_search
    if 'pool' in globals():
        pool.close()
        pool = None
    cut = __ref_cut
    cut_for_search = __ref_cut_for_search


def set_dictionary(dictionary_path):
    global initialized, DICTIONARY
    with DICT_LOCK:
        abs_path = os.path.normpath(os.path.join(os.getcwd(), dictionary_path))
        if not os.path.exists(abs_path):
            raise Exception("jieba: path does not exist: " + abs_path)
        DICTIONARY = abs_path
        initialized = False


def get_abs_path_dict():
    _curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
    abs_path = os.path.join(_curpath, DICTIONARY)
    return abs_path


def tokenize(unicode_sentence, mode="default"):
    # mode: "default" or "search"
    if not isinstance(unicode_sentence, unicode):
        raise Exception("jieba: the input parameter should be unicode.")
    start = 0
    if mode == 'default':
        for w in cut(unicode_sentence):
            width = len(w)
            yield (w, start, start + width)
            start += width
    else:
        for w in cut(unicode_sentence):
            width = len(w)
            if len(w) > 2:
                for i in xrange(len(w) - 1):
                    gram2 = w[i:i + 2]
                    if gram2 in FREQ:
                        yield (gram2, start + i, start + i + 2)
            if len(w) > 3:
                for i in xrange(len(w) - 2):
                    gram3 = w[i:i + 3]
                    if gram3 in FREQ:
                        yield (gram3, start + i, start + i + 3)
            yield (w, start, start + width)
            start += width
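For orientation, a minimal usage sketch of the module deleted above, assuming the vendored jieba 0.31 API (Python 2, to match the code); cut() is lazy and builds the trie on first use:

# -*- coding: utf-8 -*-
import jieba

# accurate mode: the most probable segmentation via the DAG + calc() above
print "/".join(jieba.cut(u"我来到北京清华大学", cut_all=False))
# full mode: every word the trie can find
print "/".join(jieba.cut(u"我来到北京清华大学", cut_all=True))
# search mode: extra 2- and 3-grams for building a search index
print "/".join(jieba.cut_for_search(u"小明硕士毕业于中国科学院计算所"))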
@@ -1,38 +0,0 @@
import jieba
import os
try:
    from analyzer import ChineseAnalyzer
except ImportError:
    pass

_curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
f_name = os.path.join(_curpath, "idf.txt")
content = open(f_name, 'rb').read().decode('utf-8')

idf_freq = {}
lines = content.split('\n')
for line in lines:
    word, freq = line.split(' ')
    idf_freq[word] = float(freq)

median_idf = sorted(idf_freq.values())[len(idf_freq) / 2]
stop_words = set([
    "the", "of", "is", "and", "to", "in", "that", "we", "for", "an", "are",
    "by", "be", "as", "on", "with", "can", "if", "from", "which", "you",
    "it", "this", "then", "at", "have", "all", "not", "one", "has", "or"
])


def extract_tags(sentence, topK=20):
    words = jieba.cut(sentence)
    freq = {}
    for w in words:
        if len(w.strip()) < 2:
            continue
        if w.lower() in stop_words:
            continue
        freq[w] = freq.get(w, 0.0) + 1.0
    total = sum(freq.values())
    freq = [(k, v / total) for k, v in freq.iteritems()]

    tf_idf_list = [(v * idf_freq.get(k, median_idf), k) for k, v in freq]
    st_list = sorted(tf_idf_list, reverse=True)

    top_tuples = st_list[:topK]
    tags = [a[1] for a in top_tuples]
    return tags
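A sketch of how this TF-IDF keyword extractor was invoked (it shipped as jieba.analyse in 0.31; the sample text is illustrative):

# -*- coding: utf-8 -*-
import jieba.analyse

text = u"自然语言处理是人工智能的一个重要方向"
for tag in jieba.analyse.extract_tags(text, topK=5):
    print tag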
@@ -1,33 +0,0 @@
#encoding=utf-8
from whoosh.analysis import RegexAnalyzer, LowercaseFilter, StopFilter
from whoosh.analysis import Tokenizer, Token

import jieba
import re

STOP_WORDS = frozenset(('a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'can',
                        'for', 'from', 'have', 'if', 'in', 'is', 'it', 'may',
                        'not', 'of', 'on', 'or', 'tbd', 'that', 'the', 'this',
                        'to', 'us', 'we', 'when', 'will', 'with', 'yet',
                        'you', 'your', u'的', u'了', u'和'))

accepted_chars = re.compile(ur"[\u4E00-\u9FA5]+")


class ChineseTokenizer(Tokenizer):
    def __call__(self, text, **kargs):
        words = jieba.tokenize(text, mode="search")
        token = Token()
        for (w, start_pos, stop_pos) in words:
            # keep multi-character non-Chinese tokens; skip single
            # non-Chinese characters
            if not accepted_chars.match(w) and len(w) <= 1:
                continue
            token.original = token.text = w
            token.pos = start_pos
            token.startchar = start_pos
            token.endchar = stop_pos
            yield token


def ChineseAnalyzer(stoplist=STOP_WORDS, minsize=1):
    return ChineseTokenizer() | LowercaseFilter() | StopFilter(stoplist=stoplist, minsize=minsize)
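A sketch of wiring this analyzer into a Whoosh schema (assuming whoosh is installed; a Whoosh analyzer instance is callable and yields Token objects):

# -*- coding: utf-8 -*-
from whoosh.fields import Schema, TEXT
from jieba.analyse import ChineseAnalyzer  # import path as vendored in 0.31

analyzer = ChineseAnalyzer()
for token in analyzer(u"我的好朋友是李明"):
    print token.text

schema = Schema(content=TEXT(stored=True, analyzer=analyzer))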
File diff suppressed because it is too large
File diff suppressed because it is too large
Binary file not shown.
@@ -1,105 +0,0 @@
from __future__ import with_statement
import re
import os
import marshal
import sys

MIN_FLOAT = -3.14e100

PROB_START_P = "prob_start.p"
PROB_TRANS_P = "prob_trans.p"
PROB_EMIT_P = "prob_emit.p"


PrevStatus = {
    'B': ('E', 'S'),
    'M': ('M', 'B'),
    'S': ('S', 'E'),
    'E': ('B', 'M')
}


def load_model():
    _curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

    start_p = {}
    abs_path = os.path.join(_curpath, PROB_START_P)
    with open(abs_path, 'rb') as f:
        start_p = marshal.load(f)

    trans_p = {}
    abs_path = os.path.join(_curpath, PROB_TRANS_P)
    with open(abs_path, 'rb') as f:
        trans_p = marshal.load(f)

    emit_p = {}
    abs_path = os.path.join(_curpath, PROB_EMIT_P)
    with open(abs_path, 'rb') as f:
        emit_p = marshal.load(f)

    return start_p, trans_p, emit_p


if sys.platform.startswith("java"):
    start_P, trans_P, emit_P = load_model()
else:
    import prob_start, prob_trans, prob_emit
    start_P, trans_P, emit_P = prob_start.P, prob_trans.P, prob_emit.P


def viterbi(obs, states, start_p, trans_p, emit_p):
    V = [{}]  # tabular
    path = {}
    for y in states:  # init
        V[0][y] = start_p[y] + emit_p[y].get(obs[0], MIN_FLOAT)
        path[y] = [y]
    for t in range(1, len(obs)):
        V.append({})
        newpath = {}
        for y in states:
            em_p = emit_p[y].get(obs[t], MIN_FLOAT)
            (prob, state) = max([(V[t - 1][y0] + trans_p[y0].get(y, MIN_FLOAT) + em_p, y0) for y0 in PrevStatus[y]])
            V[t][y] = prob
            newpath[y] = path[state] + [y]
        path = newpath

    (prob, state) = max([(V[len(obs) - 1][y], y) for y in ('E', 'S')])

    return (prob, path[state])


def __cut(sentence):
    global emit_P
    prob, pos_list = viterbi(sentence, ('B', 'M', 'E', 'S'), start_P, trans_P, emit_P)
    begin, next = 0, 0
    for i, char in enumerate(sentence):
        pos = pos_list[i]
        if pos == 'B':
            begin = i
        elif pos == 'E':
            yield sentence[begin:i + 1]
            next = i + 1
        elif pos == 'S':
            yield char
            next = i + 1
    if next < len(sentence):
        yield sentence[next:]


def cut(sentence):
    if not isinstance(sentence, unicode):
        try:
            sentence = sentence.decode('utf-8')
        except UnicodeDecodeError:
            sentence = sentence.decode('gbk', 'ignore')
    re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"(\d+\.\d+|[a-zA-Z0-9]+)")
    blocks = re_han.split(sentence)
    for blk in blocks:
        if re_han.match(blk):
            for word in __cut(blk):
                yield word
        else:
            tmp = re_skip.split(blk)
            for x in tmp:
                if x != "":
                    yield x
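This HMM segmenter is the fallback the core module uses for character runs missing from the dictionary; it also works standalone (a sketch, Python 2):

# -*- coding: utf-8 -*-
import jieba.finalseg

# viterbi() tags each character B/M/E/S; __cut() groups the tags into words
print "/".join(jieba.finalseg.cut(u"小明硕士毕业于中国科学院计算所"))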
Binary file not shown.
File diff suppressed because it is too large
Binary file not shown.
@@ -1,4 +0,0 @@
P = {'B': -0.26268660809250016,
     'E': -3.14e+100,
     'M': -3.14e+100,
     'S': -1.4652633398537678}
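These are natural-log probabilities of the state of a word's first character (B = begin of a multi-character word, S = single-character word; a word cannot start in M or E, so those get -3.14e+100 as a stand-in for log 0). The two live entries should sum to one in probability space:

# sanity check, assuming the values are natural logs:
from math import exp
print exp(-0.26268660809250016) + exp(-1.4652633398537678)  # ~= 1.0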
Binary file not shown.
@@ -1,4 +0,0 @@
P = {'B': {'E': -0.510825623765990, 'M': -0.916290731874155},
     'E': {'B': -0.5897149736854513, 'S': -0.8085250474669937},
     'M': {'E': -0.33344856811948514, 'M': -1.2603623820268226},
     'S': {'B': -0.7211965654669841, 'S': -0.6658631448798212}}
@@ -1,202 +0,0 @@
from __future__ import with_statement
import re
import os
import viterbi
import jieba
import sys
import marshal

default_encoding = sys.getfilesystemencoding()

PROB_START_P = "prob_start.p"
PROB_TRANS_P = "prob_trans.p"
PROB_EMIT_P = "prob_emit.p"
CHAR_STATE_TAB_P = "char_state_tab.p"


def load_model(f_name, isJython=True):
    _curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

    result = {}
    with open(f_name, "rb") as f:
        for line in f:
            line = line.strip()
            if line == "":
                continue
            word, _, tag = line.split(' ')
            result[word.decode('utf-8')] = tag
    if not isJython:
        return result

    start_p = {}
    abs_path = os.path.join(_curpath, PROB_START_P)
    with open(abs_path, 'rb') as f:
        start_p = marshal.load(f)

    trans_p = {}
    abs_path = os.path.join(_curpath, PROB_TRANS_P)
    with open(abs_path, 'rb') as f:
        trans_p = marshal.load(f)

    emit_p = {}
    abs_path = os.path.join(_curpath, PROB_EMIT_P)
    with open(abs_path, 'rb') as f:
        emit_p = marshal.load(f)

    state = {}
    abs_path = os.path.join(_curpath, CHAR_STATE_TAB_P)
    with open(abs_path, 'rb') as f:
        state = marshal.load(f)

    return state, start_p, trans_p, emit_p, result


if sys.platform.startswith("java"):
    char_state_tab_P, start_P, trans_P, emit_P, word_tag_tab = load_model(jieba.get_abs_path_dict())
else:
    import char_state_tab, prob_start, prob_trans, prob_emit
    char_state_tab_P, start_P, trans_P, emit_P = char_state_tab.P, prob_start.P, prob_trans.P, prob_emit.P
    word_tag_tab = load_model(jieba.get_abs_path_dict(), isJython=False)

if jieba.user_word_tag_tab:
    word_tag_tab.update(jieba.user_word_tag_tab)


class pair(object):
    def __init__(self, word, flag):
        self.word = word
        self.flag = flag

    def __unicode__(self):
        return self.word + u"/" + self.flag

    def __repr__(self):
        return self.__str__()

    def __str__(self):
        return self.__unicode__().encode(default_encoding)

    def encode(self, arg):
        return self.__unicode__().encode(arg)


def __cut(sentence):
    prob, pos_list = viterbi.viterbi(sentence, char_state_tab_P, start_P, trans_P, emit_P)
    begin, next = 0, 0

    for i, char in enumerate(sentence):
        pos = pos_list[i][0]
        if pos == 'B':
            begin = i
        elif pos == 'E':
            yield pair(sentence[begin:i + 1], pos_list[i][1])
            next = i + 1
        elif pos == 'S':
            yield pair(char, pos_list[i][1])
            next = i + 1
    if next < len(sentence):
        yield pair(sentence[next:], pos_list[next][1])


def __cut_detail(sentence):
    re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"([\.0-9]+|[a-zA-Z0-9]+)")
    re_eng, re_num = re.compile(ur"[a-zA-Z0-9]+"), re.compile(ur"[\.0-9]+")
    blocks = re_han.split(sentence)
    for blk in blocks:
        if re_han.match(blk):
            for word in __cut(blk):
                yield word
        else:
            tmp = re_skip.split(blk)
            for x in tmp:
                if x != "":
                    if re_num.match(x):
                        yield pair(x, 'm')
                    elif re_eng.match(x):
                        yield pair(x, 'eng')
                    else:
                        yield pair(x, 'x')


def __cut_DAG(sentence):
    DAG = jieba.get_DAG(sentence)
    route = {}

    jieba.calc(sentence, DAG, 0, route=route)

    x = 0
    buf = u''
    N = len(sentence)
    while x < N:
        y = route[x][1] + 1
        l_word = sentence[x:y]
        if y - x == 1:
            buf += l_word
        else:
            if len(buf) > 0:
                if len(buf) == 1:
                    yield pair(buf, word_tag_tab.get(buf, 'x'))
                    buf = u''
                else:
                    if buf not in jieba.FREQ:
                        recognized = __cut_detail(buf)
                        for t in recognized:
                            yield t
                    else:
                        for elem in buf:
                            yield pair(elem, word_tag_tab.get(elem, 'x'))
                    buf = u''
            yield pair(l_word, word_tag_tab.get(l_word, 'x'))
        x = y

    if len(buf) > 0:
        if len(buf) == 1:
            yield pair(buf, word_tag_tab.get(buf, 'x'))
        else:
            if buf not in jieba.FREQ:
                recognized = __cut_detail(buf)
                for t in recognized:
                    yield t
            else:
                for elem in buf:
                    yield pair(elem, word_tag_tab.get(elem, 'x'))


def __cut_internal(sentence):
    if not isinstance(sentence, unicode):
        try:
            sentence = sentence.decode('utf-8')
        except UnicodeDecodeError:
            sentence = sentence.decode('gbk', 'ignore')
    re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile(ur"(\r\n|\s)")
    re_eng, re_num = re.compile(ur"[a-zA-Z0-9]+"), re.compile(ur"[\.0-9]+")
    blocks = re_han.split(sentence)
    for blk in blocks:
        if re_han.match(blk):
            for word in __cut_DAG(blk):
                yield word
        else:
            tmp = re_skip.split(blk)
            for x in tmp:
                if re_skip.match(x):
                    yield pair(x, 'x')
                else:
                    for xx in x:
                        if re_num.match(xx):
                            yield pair(xx, 'm')
                        elif re_eng.match(xx):  # fixed: previously tested re_eng.match(x)
                            yield pair(xx, 'eng')
                        else:
                            yield pair(xx, 'x')


def __lcut_internal(sentence):
    return list(__cut_internal(sentence))


def cut(sentence):
    if (not hasattr(jieba, 'pool')) or (jieba.pool is None):
        for w in __cut_internal(sentence):
            yield w
    else:
        parts = re.compile('([\r\n]+)').split(sentence)
        result = jieba.pool.map(__lcut_internal, parts)
        for r in result:
            for w in r:
                yield w
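The tagger was exposed as jieba.posseg.cut and yields the pair objects defined above (a sketch, Python 2):

# -*- coding: utf-8 -*-
import jieba.posseg

for w in jieba.posseg.cut(u"我爱北京天安门"):
    print w.word, w.flag   # e.g. 天安门 ns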
Binary file not shown.
File diff suppressed because it is too large
Binary file not shown.
File diff suppressed because it is too large
Binary file not shown.
@@ -1,256 +0,0 @@
P = {('B', 'a'): -4.762305214596967,
     ('B', 'ad'): -6.680066036784177,
     ('B', 'ag'): -3.14e+100,
     ('B', 'an'): -8.697083223018778,
     ('B', 'b'): -5.018374362109218,
     ('B', 'bg'): -3.14e+100,
     ('B', 'c'): -3.423880184954888,
     ('B', 'd'): -3.9750475297585357,
     ('B', 'df'): -8.888974230828882,
     ('B', 'dg'): -3.14e+100,
     ('B', 'e'): -8.563551830394255,
     ('B', 'en'): -3.14e+100,
     ('B', 'f'): -5.491630418482717,
     ('B', 'g'): -3.14e+100,
     ('B', 'h'): -13.533365129970255,
     ('B', 'i'): -6.1157847275557105,
     ('B', 'in'): -3.14e+100,
     ('B', 'j'): -5.0576191284681915,
     ('B', 'jn'): -3.14e+100,
     ('B', 'k'): -3.14e+100,
     ('B', 'l'): -4.905883584659895,
     ('B', 'ln'): -3.14e+100,
     ('B', 'm'): -3.6524299819046386,
     ('B', 'mg'): -3.14e+100,
     ('B', 'mq'): -6.78695300139688,
     ('B', 'n'): -1.6966257797548328,
     ('B', 'ng'): -3.14e+100,
     ('B', 'nr'): -2.2310495913769506,
     ('B', 'nrfg'): -5.873722175405573,
     ('B', 'nrt'): -4.985642733519195,
     ('B', 'ns'): -2.8228438314969213,
     ('B', 'nt'): -4.846091668182416,
     ('B', 'nz'): -3.94698846057672,
     ('B', 'o'): -8.433498702146057,
     ('B', 'p'): -4.200984132085048,
     ('B', 'q'): -6.998123858956596,
     ('B', 'qe'): -3.14e+100,
     ('B', 'qg'): -3.14e+100,
     ('B', 'r'): -3.4098187790818413,
     ('B', 'rg'): -3.14e+100,
     ('B', 'rr'): -12.434752841302146,
     ('B', 'rz'): -7.946116471570005,
     ('B', 's'): -5.522673590839954,
     ('B', 't'): -3.3647479094528574,
     ('B', 'tg'): -3.14e+100,
     ('B', 'u'): -9.163917277503234,
     ('B', 'ud'): -3.14e+100,
     ('B', 'ug'): -3.14e+100,
     ('B', 'uj'): -3.14e+100,
     ('B', 'ul'): -3.14e+100,
     ('B', 'uv'): -3.14e+100,
     ('B', 'uz'): -3.14e+100,
     ('B', 'v'): -2.6740584874265685,
     ('B', 'vd'): -9.044728760238115,
     ('B', 'vg'): -3.14e+100,
     ('B', 'vi'): -12.434752841302146,
     ('B', 'vn'): -4.3315610890163585,
     ('B', 'vq'): -12.147070768850364,
     ('B', 'w'): -3.14e+100,
     ('B', 'x'): -3.14e+100,
     ('B', 'y'): -9.844485675856319,
     ('B', 'yg'): -3.14e+100,
     ('B', 'z'): -7.045681111485645,
     ('B', 'zg'): -3.14e+100,
     ('E', 'a'): -3.14e+100,
     ('E', 'ad'): -3.14e+100,
     ('E', 'ag'): -3.14e+100,
     ('E', 'an'): -3.14e+100,
     ('E', 'b'): -3.14e+100,
     ('E', 'bg'): -3.14e+100,
     ('E', 'c'): -3.14e+100,
     ('E', 'd'): -3.14e+100,
     ('E', 'df'): -3.14e+100,
     ('E', 'dg'): -3.14e+100,
     ('E', 'e'): -3.14e+100,
     ('E', 'en'): -3.14e+100,
     ('E', 'f'): -3.14e+100,
     ('E', 'g'): -3.14e+100,
     ('E', 'h'): -3.14e+100,
     ('E', 'i'): -3.14e+100,
     ('E', 'in'): -3.14e+100,
     ('E', 'j'): -3.14e+100,
     ('E', 'jn'): -3.14e+100,
     ('E', 'k'): -3.14e+100,
     ('E', 'l'): -3.14e+100,
     ('E', 'ln'): -3.14e+100,
     ('E', 'm'): -3.14e+100,
     ('E', 'mg'): -3.14e+100,
     ('E', 'mq'): -3.14e+100,
     ('E', 'n'): -3.14e+100,
     ('E', 'ng'): -3.14e+100,
     ('E', 'nr'): -3.14e+100,
     ('E', 'nrfg'): -3.14e+100,
     ('E', 'nrt'): -3.14e+100,
     ('E', 'ns'): -3.14e+100,
     ('E', 'nt'): -3.14e+100,
     ('E', 'nz'): -3.14e+100,
     ('E', 'o'): -3.14e+100,
     ('E', 'p'): -3.14e+100,
     ('E', 'q'): -3.14e+100,
     ('E', 'qe'): -3.14e+100,
     ('E', 'qg'): -3.14e+100,
     ('E', 'r'): -3.14e+100,
     ('E', 'rg'): -3.14e+100,
     ('E', 'rr'): -3.14e+100,
     ('E', 'rz'): -3.14e+100,
     ('E', 's'): -3.14e+100,
     ('E', 't'): -3.14e+100,
     ('E', 'tg'): -3.14e+100,
     ('E', 'u'): -3.14e+100,
     ('E', 'ud'): -3.14e+100,
     ('E', 'ug'): -3.14e+100,
     ('E', 'uj'): -3.14e+100,
     ('E', 'ul'): -3.14e+100,
     ('E', 'uv'): -3.14e+100,
     ('E', 'uz'): -3.14e+100,
     ('E', 'v'): -3.14e+100,
     ('E', 'vd'): -3.14e+100,
     ('E', 'vg'): -3.14e+100,
     ('E', 'vi'): -3.14e+100,
     ('E', 'vn'): -3.14e+100,
     ('E', 'vq'): -3.14e+100,
     ('E', 'w'): -3.14e+100,
     ('E', 'x'): -3.14e+100,
     ('E', 'y'): -3.14e+100,
     ('E', 'yg'): -3.14e+100,
     ('E', 'z'): -3.14e+100,
     ('E', 'zg'): -3.14e+100,
     ('M', 'a'): -3.14e+100,
     ('M', 'ad'): -3.14e+100,
     ('M', 'ag'): -3.14e+100,
     ('M', 'an'): -3.14e+100,
     ('M', 'b'): -3.14e+100,
     ('M', 'bg'): -3.14e+100,
     ('M', 'c'): -3.14e+100,
     ('M', 'd'): -3.14e+100,
     ('M', 'df'): -3.14e+100,
     ('M', 'dg'): -3.14e+100,
     ('M', 'e'): -3.14e+100,
     ('M', 'en'): -3.14e+100,
     ('M', 'f'): -3.14e+100,
     ('M', 'g'): -3.14e+100,
     ('M', 'h'): -3.14e+100,
     ('M', 'i'): -3.14e+100,
     ('M', 'in'): -3.14e+100,
     ('M', 'j'): -3.14e+100,
     ('M', 'jn'): -3.14e+100,
     ('M', 'k'): -3.14e+100,
     ('M', 'l'): -3.14e+100,
     ('M', 'ln'): -3.14e+100,
     ('M', 'm'): -3.14e+100,
     ('M', 'mg'): -3.14e+100,
     ('M', 'mq'): -3.14e+100,
     ('M', 'n'): -3.14e+100,
     ('M', 'ng'): -3.14e+100,
     ('M', 'nr'): -3.14e+100,
     ('M', 'nrfg'): -3.14e+100,
     ('M', 'nrt'): -3.14e+100,
     ('M', 'ns'): -3.14e+100,
     ('M', 'nt'): -3.14e+100,
     ('M', 'nz'): -3.14e+100,
     ('M', 'o'): -3.14e+100,
     ('M', 'p'): -3.14e+100,
     ('M', 'q'): -3.14e+100,
     ('M', 'qe'): -3.14e+100,
     ('M', 'qg'): -3.14e+100,
     ('M', 'r'): -3.14e+100,
     ('M', 'rg'): -3.14e+100,
     ('M', 'rr'): -3.14e+100,
     ('M', 'rz'): -3.14e+100,
     ('M', 's'): -3.14e+100,
     ('M', 't'): -3.14e+100,
     ('M', 'tg'): -3.14e+100,
     ('M', 'u'): -3.14e+100,
     ('M', 'ud'): -3.14e+100,
     ('M', 'ug'): -3.14e+100,
     ('M', 'uj'): -3.14e+100,
     ('M', 'ul'): -3.14e+100,
     ('M', 'uv'): -3.14e+100,
     ('M', 'uz'): -3.14e+100,
     ('M', 'v'): -3.14e+100,
     ('M', 'vd'): -3.14e+100,
     ('M', 'vg'): -3.14e+100,
     ('M', 'vi'): -3.14e+100,
     ('M', 'vn'): -3.14e+100,
     ('M', 'vq'): -3.14e+100,
     ('M', 'w'): -3.14e+100,
     ('M', 'x'): -3.14e+100,
     ('M', 'y'): -3.14e+100,
     ('M', 'yg'): -3.14e+100,
     ('M', 'z'): -3.14e+100,
     ('M', 'zg'): -3.14e+100,
     ('S', 'a'): -3.9025396831295227,
     ('S', 'ad'): -11.048458480182255,
     ('S', 'ag'): -6.954113917960154,
     ('S', 'an'): -12.84021794941031,
     ('S', 'b'): -6.472888763970454,
     ('S', 'bg'): -3.14e+100,
     ('S', 'c'): -4.786966795861212,
     ('S', 'd'): -3.903919764181873,
     ('S', 'df'): -3.14e+100,
     ('S', 'dg'): -8.948397651299683,
     ('S', 'e'): -5.942513006281674,
     ('S', 'en'): -3.14e+100,
     ('S', 'f'): -5.194820249981676,
     ('S', 'g'): -6.507826815331734,
     ('S', 'h'): -8.650563207383884,
     ('S', 'i'): -3.14e+100,
     ('S', 'in'): -3.14e+100,
     ('S', 'j'): -4.911992119644354,
     ('S', 'jn'): -3.14e+100,
     ('S', 'k'): -6.940320595827818,
     ('S', 'l'): -3.14e+100,
     ('S', 'ln'): -3.14e+100,
     ('S', 'm'): -3.269200652116097,
     ('S', 'mg'): -10.825314928868044,
     ('S', 'mq'): -3.14e+100,
     ('S', 'n'): -3.8551483897645107,
     ('S', 'ng'): -4.913434861102905,
     ('S', 'nr'): -4.483663103956885,
     ('S', 'nrfg'): -3.14e+100,
     ('S', 'nrt'): -3.14e+100,
     ('S', 'ns'): -3.14e+100,
     ('S', 'nt'): -12.147070768850364,
     ('S', 'nz'): -3.14e+100,
     ('S', 'o'): -8.464460927750023,
     ('S', 'p'): -2.9868401813596317,
     ('S', 'q'): -4.888658618255058,
     ('S', 'qe'): -3.14e+100,
     ('S', 'qg'): -3.14e+100,
     ('S', 'r'): -2.7635336784127853,
     ('S', 'rg'): -10.275268591948773,
     ('S', 'rr'): -3.14e+100,
     ('S', 'rz'): -3.14e+100,
     ('S', 's'): -3.14e+100,
     ('S', 't'): -3.14e+100,
     ('S', 'tg'): -6.272842531880403,
     ('S', 'u'): -6.940320595827818,
     ('S', 'ud'): -7.728230161053767,
     ('S', 'ug'): -7.5394037026636855,
     ('S', 'uj'): -6.85251045118004,
     ('S', 'ul'): -8.4153713175535,
     ('S', 'uv'): -8.15808672228609,
     ('S', 'uz'): -9.299258625372996,
     ('S', 'v'): -3.053292303412302,
     ('S', 'vd'): -3.14e+100,
     ('S', 'vg'): -5.9430181843676895,
     ('S', 'vi'): -3.14e+100,
     ('S', 'vn'): -11.453923588290419,
     ('S', 'vq'): -3.14e+100,
     ('S', 'w'): -3.14e+100,
     ('S', 'x'): -8.427419656069674,
     ('S', 'y'): -6.1970794699489575,
     ('S', 'yg'): -13.533365129970255,
     ('S', 'z'): -3.14e+100,
     ('S', 'zg'): -3.14e+100}
Binary file not shown.
File diff suppressed because it is too large
@@ -1,43 +0,0 @@
import operator
MIN_FLOAT = -3.14e100


def get_top_states(t_state_v, K=4):
    items = t_state_v.items()
    topK = sorted(items, key=operator.itemgetter(1), reverse=True)[:K]
    return [x[0] for x in topK]


def viterbi(obs, states, start_p, trans_p, emit_p):
    V = [{}]  # tabular
    mem_path = [{}]
    all_states = trans_p.keys()
    for y in states.get(obs[0], all_states):  # init
        V[0][y] = start_p[y] + emit_p[y].get(obs[0], MIN_FLOAT)
        mem_path[0][y] = ''
    for t in range(1, len(obs)):
        V.append({})
        mem_path.append({})
        prev_states = get_top_states(V[t - 1])
        # NOTE: the top-K beam above is immediately overwritten; in effect
        # every reachable previous state is kept:
        prev_states = [x for x in mem_path[t - 1].keys() if len(trans_p[x]) > 0]

        prev_states_expect_next = set((y for x in prev_states for y in trans_p[x].keys()))
        obs_states = states.get(obs[t], all_states)
        obs_states = set(obs_states) & prev_states_expect_next

        if len(obs_states) == 0:
            obs_states = all_states
        for y in obs_states:
            (prob, state) = max([(V[t - 1][y0] + trans_p[y0].get(y, MIN_FLOAT) + emit_p[y].get(obs[t], MIN_FLOAT), y0) for y0 in prev_states])
            V[t][y] = prob
            mem_path[t][y] = state

    last = [(V[-1][y], y) for y in mem_path[-1].keys()]
    (prob, state) = max(last)

    route = [None] * len(obs)
    i = len(obs) - 1
    while i >= 0:
        route[i] = state
        state = mem_path[i][state]
        i -= 1
    return (prob, route)
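For reference, a toy invocation of this pruned Viterbi with a made-up two-state model; the tables below are illustrative, not jieba's real ones (states maps each observation to its allowed states):

# -*- coding: utf-8 -*-
import viterbi  # the module above, importable inside the vendored package

start_p = {'B': -0.7, 'S': -0.7}
trans_p = {'B': {'S': -0.5}, 'S': {'B': -0.5, 'S': -1.0}}
emit_p = {'B': {u'中': -1.0}, 'S': {u'国': -1.0}}
states = {u'中': ('B',), u'国': ('S',)}

prob, route = viterbi.viterbi(u"中国", states, start_p, trans_p, emit_p)
print prob, route  # -3.2 ['B', 'S']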
@@ -114,7 +114,7 @@ class StopWordsChinese(StopWords):
     def candidate_words(self, stripped_input):
         # jieba builds a tree that takes a while. avoid building
         # this tree if we don't use the chinese language
-        from .packages import jieba
+        import jieba
         return jieba.cut(stripped_input, cut_all=True)
@@ -6,4 +6,5 @@ lxml==3.3.5
 nltk==2.0.4
 requests==2.3.0
 six==1.7.3
+jieba==0.35
 -e git+https://github.com/karls/responses@regex-url-matching#egg=responses