def _index_files(storeDir, indexFile):
    jieba.initialize()
    store = SimpleFSDirectory(File(storeDir))
    analyzer = SimpleAnalyzer(Version.LUCENE_CURRENT)
    analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)

    _index_docs(indexFile, writer)
    print('commit index')
    writer.commit()
    writer.close()
    print('done')
```
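
Each of these snippets calls `jieba.initialize()` before tokenizing. jieba loads its dictionary lazily on the first cut, so an explicit `initialize()` just pays that cost up front, optionally after pointing jieba at a different dictionary file. A minimal sketch (the dictionary path is an example, not from the snippets above):

``` python
import jieba

# Load the default dictionary eagerly instead of on the first cut.
jieba.initialize()

# Or switch to a larger/custom dictionary first, then initialize.
# jieba.set_dictionary('data/dict.txt.big')
# jieba.initialize()
```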
``` python
def get_search_func():
    jieba.initialize()
    vm_env = lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    analyzer = SimpleAnalyzer(Version.LUCENE_CURRENT)
    searcher = IndexSearcher(DirectoryReader.open(
        SimpleFSDirectory(File(LUCENE_INDEX_DIR))))
    search = search_func_factory(analyzer=analyzer,
                                 searcher=searcher,
                                 vm_env=vm_env)
    return search
```
``` python
def __init__(self, slack, custom):
    self.slack = slack
    self.rundata = custom['data']
    self.colorPrint = custom['colorPrint']
    self.food_dir = "data/midnight.json"
    self.food_dic = "data/dict.txt.big"
    # find midnight channel
    self.nochannel = False
    rep = self.slack.api_call("channels.list")
    self.channel_id = ""
    for c in rep['channels']:
        if c['name'].lower() == custom['food_channelname']:
            self.channel_id = c['id']
            break
    if not self.channel_id:
        self.colorPrint(
            "No midnight channel",
            "Restart when the midnight channel is available",
            color="FAIL")
        self.nochannel = True
        return
    jieba.set_dictionary(self.food_dic)
    jieba.initialize()
    # add and delete custom words
    for word in self.rundata.get('FOOD_addword'):
        jieba.add_word(word)
    for word in self.rundata.get('FOOD_delword'):
        jieba.del_word(word)
    self.init()
```
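
The vocabulary tweaks above use jieba's runtime dictionary API: `jieba.add_word` keeps a term as a single token, and `jieba.del_word` removes an entry from the in-memory dictionary. A small standalone sketch, with placeholder words rather than the bot's actual data:

``` python
import jieba

jieba.initialize()

jieba.add_word('盐酥鸡')   # keep this dish name as one token
jieba.del_word('宵夜')     # drop this entry from the in-memory dictionary

print('/'.join(jieba.cut('今天宵夜想吃盐酥鸡')))
```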
``` python
def __call__(self, text, **kargs):
    words = jieba.tokenize(text, mode="search")
    token = Token()
    for (w, start_pos, stop_pos) in words:
        # skip single-character tokens that are not word characters (punctuation, whitespace)
        if not accepted_chars.match(w) and len(w) <= 1:
            continue
        token.original = token.text = w
        token.pos = start_pos
        token.startchar = start_pos
        token.endchar = stop_pos
        yield token
```
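
The tokenizer above is built on `jieba.tokenize`, which yields `(word, start, end)` tuples; `mode="search"` additionally re-emits shorter sub-words of long terms, which is usually what a full-text index wants. A rough illustration using the sample sentence from jieba's documentation:

``` python
import jieba

sentence = u'永和服装饰品有限公司'

# Default mode: one tuple per segmented word.
for w, start, end in jieba.tokenize(sentence):
    print(w, start, end)

# Search mode: long words are also broken into overlapping sub-words.
for w, start, end in jieba.tokenize(sentence, mode='search'):
    print(w, start, end)
```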
``` python
def testTokenize(self):
    for content in test_contents:
        result = jieba.tokenize(content)
        assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
        result = list(result)
        assert isinstance(result, list), "Test Tokenize error on content: %s" % content
        for tk in result:
            print("word %s\t\t start: %d \t\t end:%d" % (tk[0], tk[1], tk[2]), file=sys.stderr)
    print("testTokenize", file=sys.stderr)
```
``` python
def testTokenize_NOHMM(self):
    for content in test_contents:
        result = jieba.tokenize(content, HMM=False)
        assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
        result = list(result)
        assert isinstance(result, list), "Test Tokenize error on content: %s" % content
        for tk in result:
            print("word %s\t\t start: %d \t\t end:%d" % (tk[0], tk[1], tk[2]), file=sys.stderr)
    print("testTokenize_NOHMM", file=sys.stderr)
```
``` python
def cuttest(test_sent):
    global g_mode
    result = jieba.tokenize(test_sent, mode=g_mode)
    for tk in result:
        print("word %s\t\t start: %d \t\t end:%d" % (tk[0], tk[1], tk[2]))
```
``` python
def cuttest(test_sent):
    global g_mode
    result = jieba.tokenize(test_sent, mode=g_mode, HMM=False)
    for tk in result:
        print("word %s\t\t start: %d \t\t end:%d" % (tk[0], tk[1], tk[2]))
```
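
The `HMM` flag in the last two helpers controls jieba's HMM-based discovery of words missing from the dictionary; with `HMM=False`, unknown runs fall back to dictionary matches and single characters. A quick way to compare the two settings, using the out-of-vocabulary example sentence from jieba's documentation:

``` python
import jieba

sentence = u'他来到了网易杭研大厦'

print('/'.join(jieba.cut(sentence)))             # HMM enabled (default)
print('/'.join(jieba.cut(sentence, HMM=False)))  # dictionary matches only
```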