# Copyright 2015 moco_beta
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
The tokenizer module supplies Token and Tokenizer classes.
Usage:
>>> from janome.tokenizer import Tokenizer
>>> t = Tokenizer()
>>> for token in t.tokenize('すもももももももものうち'):
... print(token)
...
すもも 名詞,一般,*,*,*,*,すもも,スモモ,スモモ
も 助詞,係助詞,*,*,*,*,も,モ,モ
もも 名詞,一般,*,*,*,*,もも,モモ,モモ
も 助詞,係助詞,*,*,*,*,も,モ,モ
もも 名詞,一般,*,*,*,*,もも,モモ,モモ
の 助詞,連体化,*,*,*,*,の,ノ,ノ
うち 名詞,非自立,副詞可能,*,*,*,うち,ウチ,ウチ
with wakati ('分かち書き') mode:
>>> from janome.tokenizer import Tokenizer
>>> t = Tokenizer()
>>> for token in t.tokenize('すもももももももものうち', wakati=True):
... print(token)
...
すもも
も
もも
も
もも
の
うち
with user dictionary (IPAdic format):
.. code-block:: shell
$ cat examples/user_ipadic.csv
東京スカイツリー,1288,1288,4569,名詞,固有名詞,一般,*,*,*,東京スカイツリー,トウキョウスカイツリー,トウキョウスカイツリー
東武スカイツリーライン,1288,1288,4700,名詞,固有名詞,一般,*,*,*,東武スカイツリーライン,トウブスカイツリーライン,トウブスカイツリーライン
とうきょうスカイツリー駅,1288,1288,4143,名詞,固有名詞,一般,*,*,*,とうきょうスカイツリー駅,トウキョウスカイツリーエキ,トウキョウスカイツリーエキ
>>> t = Tokenizer("user_ipadic.csv", udic_enc="utf8")
>>> for token in t.tokenize('東京スカイツリーへのお越しは、東武スカイツリーライン「とうきょうスカイツリー駅」が便利です。'):
... print(token)
...
東京スカイツリー 名詞,固有名詞,一般,*,*,*,東京スカイツリー,トウキョウスカイツリー,トウキョウスカイツリー
へ 助詞,格助詞,一般,*,*,*,へ,ヘ,エ
の 助詞,連体化,*,*,*,*,の,ノ,ノ
お越し 名詞,一般,*,*,*,*,お越し,オコシ,オコシ
は 助詞,係助詞,*,*,*,*,は,ハ,ワ
、 記号,読点,*,*,*,*,、,、,、
東武スカイツリーライン 名詞,固有名詞,一般,*,*,*,東武スカイツリーライン,トウブスカイツリーライン,トウブスカイツリーライン
「 記号,括弧開,*,*,*,*,「,「,「
とうきょうスカイツリー駅 名詞,固有名詞,一般,*,*,*,とうきょうスカイツリー駅,トウキョウスカイツリーエキ,トウキョウスカイツリーエキ
」 記号,括弧閉,*,*,*,*,」,」,」
が 助詞,格助詞,一般,*,*,*,が,ガ,ガ
便利 名詞,形容動詞語幹,*,*,*,*,便利,ベンリ,ベンリ
です 助動詞,*,*,*,特殊・デス,基本形,です,デス,デス
。 記号,句点,*,*,*,*,。,。,。
with user dictionary (simplified format):
.. code-block:: shell
$ cat examples/user_simpledic.csv
東京スカイツリー,カスタム名詞,トウキョウスカイツリー
東武スカイツリーライン,カスタム名詞,トウブスカイツリーライン
とうきょうスカイツリー駅,カスタム名詞,トウキョウスカイツリーエキ
>>> t = Tokenizer("user_simpledic.csv", udic_type="simpledic", udic_enc="utf8")
>>> for token in t.tokenize('東京スカイツリーへのお越しは、東武スカイツリーライン「とうきょうスカイツリー駅」が便利です。'):
... print(token)
"""
import sys
import os
from typing import Iterator, Union, Tuple, Optional, Any
from .lattice import Lattice, Node, SurfaceNode, BOS, EOS, NodeType # type: ignore
from .dic import UserDictionary, CompiledUserDictionary # type: ignore
from .system_dic import SystemDictionary, MMapSystemDictionary
from .fst import Matcher
try:
from janome.sysdic import all_fstdata, connections # type: ignore
except ImportError:
# hack for unit testing...
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, parent_dir)
from sysdic import all_fstdata, connections # type: ignore
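# Memory-mapped dictionary access is enabled by default on 64-bit platforms
# (sys.maxsize > 2**32), where the address space is large enough for mmap.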
DEFAULT_MMAP_MODE = sys.maxsize > 2**32
class Token(object):
"""
A Token object contains all information for a token.
"""
def __init__(self, node: Node, extra: Optional[Tuple] = None):
self.node = node
self.extra = extra
def __getattr__(self, name) -> Any:
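        # Morphological fields are taken from the ``extra`` tuple when it is set
        # (part_of_speech, infl_type, infl_form, base_form, reading, phonetic);
        # otherwise they fall back to the underlying lattice node.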
if name == 'surface':
return self.node.surface
elif name == 'part_of_speech':
return self.extra[0] if self.extra else self.node.part_of_speech
elif name == 'infl_type':
return self.extra[1] if self.extra else self.node.infl_type
elif name == 'infl_form':
return self.extra[2] if self.extra else self.node.infl_form
elif name == 'base_form':
return self.extra[3] if self.extra else self.node.base_form
elif name == 'reading':
return self.extra[4] if self.extra else self.node.reading
elif name == 'phonetic':
return self.extra[5] if self.extra else self.node.phonetic
elif name == 'node_type':
return self.node.node_type
        else:
            return None
def __setattr__(self, name: str, value: Any) -> None:
# see https://stackoverflow.com/questions/61213745/typechecking-dynamically-added-attributes
object.__setattr__(self, name, value)
def __str__(self):
return f'{self.surface}\t' \
f'{self.part_of_speech},{self.infl_type},{self.infl_form},{self.base_form},{self.reading},{self.phonetic}'
class Tokenizer(object):
"""
A Tokenizer tokenizes Japanese texts with system and optional user defined dictionary.
"""
MAX_CHUNK_SIZE = 1024
CHUNK_SIZE = 500
def __init__(self, udic: str = '', *,
udic_enc: str = 'utf8',
udic_type: str = 'ipadic',
max_unknown_length: int = 1024,
wakati: bool = False,
mmap: bool = DEFAULT_MMAP_MODE,
dotfile: str = ''):
"""
Initialize Tokenizer object with optional arguments.
:param udic: (Optional) user dictionary file (CSV format) or directory path to compiled dictionary data
        :param udic_enc: (Optional) character encoding for the user dictionary. Default is 'utf8'.
        :param udic_type: (Optional) user dictionary type. Supported types are 'ipadic' and 'simpledic'.
                          Default is 'ipadic'.
        :param max_unknown_length: (Optional) maximum unknown word length. Default is 1024.
        :param wakati: (Optional) if given True, loads only the minimal sysdic data needed for 'wakati' mode.
        :param mmap: (Optional) if given False, memory-mapped file mode is disabled.
                     Set this option to False in environments that do not support mmap.
                     Default is True on 64-bit architectures, otherwise False.
.. seealso:: https://janome.mocobeta.dev/en/#how-to-use-with-user-defined-dictionary
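
        Example (a minimal sketch; ``examples/user_simpledic.csv`` is the sample file used in the
        module-level examples above)::

            >>> t = Tokenizer('examples/user_simpledic.csv', udic_type='simpledic', udic_enc='utf8')
            >>> t = Tokenizer(wakati=True)  # wakati-only mode; loads the minimal dictionary data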
"""
self.sys_dic: Union[SystemDictionary, MMapSystemDictionary]
self.user_dic: Optional[Union[UserDictionary, CompiledUserDictionary]]
self.wakati = wakati
self.matcher = Matcher(all_fstdata())
if mmap:
self.sys_dic = MMapSystemDictionary.instance()
else:
self.sys_dic = SystemDictionary.instance()
if udic:
if udic.endswith('.csv'):
# build user dictionary from CSV
self.user_dic = UserDictionary(udic, udic_enc, udic_type, connections)
elif os.path.isdir(udic):
# load compiled user dictionary
self.user_dic = CompiledUserDictionary(udic, connections)
else:
self.user_dic = None
else:
self.user_dic = None
self.max_unknown_length = max_unknown_length
def tokenize(self, text: str, *, wakati: bool = False, baseform_unk: bool = True, dotfile: str = '') \
-> Iterator[Union[Token, str]]:
"""
Tokenize the input text.
:param text: unicode string to be tokenized
        :param wakati: (Optional) if given True, returns surface forms only. Default is False.
        :param baseform_unk: (Optional) if given True, sets the base_form attribute for unknown tokens. Default is True.
        :param dotfile: (Optional) if specified, a graphviz dot file for visualizing the lattice graph is written
                        to the given path. This option is ignored when the input length is MAX_CHUNK_SIZE or more.
        :return: generator yielding Token objects (wakati=False) or surface strings (wakati=True)
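
        Example (the output mirrors the wakati example in the module docstring)::

            >>> t = Tokenizer()
            >>> list(t.tokenize('すもももももももものうち', wakati=True))
            ['すもも', 'も', 'もも', 'も', 'もも', 'の', 'うち']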
"""
if self.wakati:
wakati = True
if dotfile and len(text) < Tokenizer.MAX_CHUNK_SIZE:
return self.__tokenize_stream(text, wakati, baseform_unk, dotfile)
else:
return self.__tokenize_stream(text, wakati, baseform_unk, '')
def __tokenize_stream(self, text, wakati, baseform_unk, dotfile):
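        # Tokenize the text chunk by chunk: __tokenize_partial consumes a prefix of
        # the remaining text and reports how many characters were processed.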
text = text.strip()
text_length = len(text)
processed = 0
while processed < text_length:
tokens, pos = self.__tokenize_partial(text[processed:], wakati, baseform_unk, dotfile)
for token in tokens:
yield token
processed += pos
def __tokenize_partial(self, text, wakati, baseform_unk, dotfile):
if self.wakati and not wakati:
raise WakatiModeOnlyException
chunk_size = min(len(text), Tokenizer.MAX_CHUNK_SIZE)
lattice = Lattice(chunk_size, self.sys_dic)
pos = 0
while not self.__should_split(text, pos):
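            # dictionary lookups operate on UTF-8 bytes and are limited to a
            # window of at most 50 characters from the current position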
encoded_partial_text = text[pos:pos + min(50, chunk_size - pos)].encode('utf-8')
            matched = False
            # user dictionary
            if self.user_dic:
                entries = self.user_dic.lookup(encoded_partial_text)
                for e in entries:
                    lattice.add(SurfaceNode(e, NodeType.USER_DICT))
                matched = len(entries) > 0
            # system dictionary
            entries = self.sys_dic.lookup(encoded_partial_text, self.matcher)
            for e in entries:
                lattice.add(SurfaceNode(e, NodeType.SYS_DICT))
            # do not discard a user dictionary match when the system dictionary
            # finds nothing at this position
            matched = matched or len(entries) > 0
# unknown
cates = self.sys_dic.get_char_categories(text[pos])
if cates:
for cate in cates:
if matched and not self.sys_dic.unknown_invoked_always(cate):
continue
# unknown word length
length = self.sys_dic.unknown_length(cate) \
if not self.sys_dic.unknown_grouping(cate) else self.max_unknown_length
assert length >= 0
# buffer for unknown word
buf = text[pos]
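                    # extend the buffer while the following characters belong to
                    # the same or a compatible character category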
for p in range(pos + 1, min(chunk_size, pos + length + 1)):
_cates = self.sys_dic.get_char_categories(text[p])
if cate in _cates or any(cate in _compat_cates for _compat_cates in _cates.values()):
buf += text[p]
else:
break
unknown_entries = self.sys_dic.unknowns.get(cate)
assert unknown_entries
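                    # register a pseudo dictionary entry for each unknown word
                    # template defined for this character category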
for entry in unknown_entries:
left_id, right_id, cost, part_of_speech = entry
base_form = buf if baseform_unk else '*'
dummy_dict_entry = (buf, left_id, right_id, cost, part_of_speech, '*', '*', base_form, '*', '*')
lattice.add(Node(dummy_dict_entry, NodeType.UNKNOWN))
pos += lattice.forward()
lattice.end()
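        # extract the minimum-cost path through the lattice; it always starts
        # with a BOS node and ends with an EOS node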
min_cost_path = lattice.backward()
assert isinstance(min_cost_path[0], BOS)
assert isinstance(min_cost_path[-1], EOS)
if wakati:
tokens = [node.surface for node in min_cost_path[1:-1]]
else:
tokens = []
for node in min_cost_path[1:-1]:
if type(node) is SurfaceNode and node.node_type is NodeType.SYS_DICT:
tokens.append(Token(node, self.sys_dic.lookup_extra(node.num)))
elif type(node) is SurfaceNode and node.node_type is NodeType.USER_DICT:
tokens.append(Token(node, self.user_dic.lookup_extra(node.num)))
else:
tokens.append(Token(node))
if dotfile:
lattice.generate_dotfile(filename=dotfile)
return (tokens, pos)
def __should_split(self, text, pos):
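        # A chunk ends at the end of the text, at MAX_CHUNK_SIZE at the latest,
        # or once CHUNK_SIZE characters have been consumed and the text so far
        # ends at a natural break (punctuation or a blank line).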
return \
pos >= len(text) or \
pos >= Tokenizer.MAX_CHUNK_SIZE or \
(pos >= Tokenizer.CHUNK_SIZE and self.__splittable(text[:pos]))
def __splittable(self, text):
return self.__is_punct(text[-1]) or self.__is_newline(text)
def __is_punct(self, c):
return c == u'、' or c == u'。' or c == u',' or c == u'.' or c == u'?' or c == u'?' or c == u'!' or c == u'!'
def __is_newline(self, text):
return text.endswith('\n\n') or text.endswith('\r\n\r\n')
class WakatiModeOnlyException(Exception):
    """
    Raised when a Tokenizer initialized with ``wakati=True`` is asked to tokenize in non-wakati mode.
    """
    pass