Source code for deepke.relation_extraction.standard.tools.serializer

import re
import unicodedata
import jieba
import logging
from typing import List

logger = logging.getLogger(__name__)
jieba.setLogLevel(logging.INFO)


class Serializer:
    def __init__(self, never_split: List = None, do_lower_case=True, do_chinese_split=False):
        self.never_split = never_split if never_split is not None else []
        self.do_lower_case = do_lower_case
        self.do_chinese_split = do_chinese_split
    def serialize(self, text, never_split: List = None):
        """
        Split a piece of text into a list of tokens according to the configured rules.
        Args:
            text (String): the text to split
            never_split (List): words that must never be split, defaults to empty
        Return:
            output_tokens (List): the resulting tokens
        """
        never_split = self.never_split + (never_split if never_split is not None else [])
        text = self._clean_text(text)

        if self.do_chinese_split:
            output_tokens = self._use_jieba_cut(text, never_split)
            return output_tokens

        text = self._tokenize_chinese_chars(text)
        orig_tokens = self._orig_tokenize(text)
        split_tokens = []
        for token in orig_tokens:
            if self.do_lower_case and token not in never_split:
                token = token.lower()
                token = self._run_strip_accents(token)
            split_tokens.extend(self._run_split_on_punc(token, never_split=never_split))

        output_tokens = self._whitespace_tokenize(" ".join(split_tokens))
        return output_tokens
    def _clean_text(self, text):
        """
        Remove invalid characters and normalize whitespace in the text.
        Arg:
            text (String): the text to clean
        Return:
            "".join(output) (String): the cleaned text
        """
        output = []
        for char in text:
            cp = ord(char)
            if cp == 0 or cp == 0xfffd or self.is_control(char):
                continue
            if self.is_whitespace(char):
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)

    def _use_jieba_cut(self, text, never_split):
        """
        Tokenize with jieba.
        Args:
            text (String): the text to split
            never_split (List): words that must never be split
        Return:
            tokens (List): the resulting tokens
        """
        for word in never_split:
            jieba.suggest_freq(word, True)
        tokens = jieba.lcut(text)
        if self.do_lower_case:
            tokens = [i.lower() for i in tokens]
        try:
            while True:
                tokens.remove(' ')
        except ValueError:
            return tokens

    def _tokenize_chinese_chars(self, text):
        """
        Add whitespace around CJK characters.
        Arg:
            text (String): the text to process
        Return:
            "".join(output) (String): the text with spaces added
        """
        output = []
        for char in text:
            cp = ord(char)
            if self.is_chinese_char(cp):
                output.append(" ")
                output.append(char)
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)

    def _orig_tokenize(self, text):
        """
        Split the text on whitespace and some punctuation marks (such as commas or periods).
        Arg:
            text (String): the text to split
        Return:
            tokens (List): the resulting tokens
        """
        text = text.strip()
        if not text:
            return []
        # Common sentence-delimiting punctuation
        punc = """,.?!;: 、|,。?!;:《》「」【】/<>|\“ ”‘ ’"""
        punc_re = '|'.join(re.escape(x) for x in punc)
        tokens = re.sub(punc_re, lambda x: ' ' + x.group() + ' ', text)
        tokens = tokens.split()
        return tokens

    def _whitespace_tokenize(self, text):
        """
        Perform basic whitespace cleaning and splitting.
        Arg:
            text (String): the text to split
        Return:
            tokens (List): the resulting tokens
        """
        text = text.strip()
        if not text:
            return []
        tokens = text.split()
        return tokens

    def _run_strip_accents(self, text):
        """
        Strip accent marks from the text.
        Arg:
            text (String): the text to process
        Return:
            "".join(output) (String): the text with accents removed
        """
        text = unicodedata.normalize("NFD", text)
        output = []
        for char in text:
            cat = unicodedata.category(char)
            if cat == "Mn":
                continue
            output.append(char)
        return "".join(output)

    def _run_split_on_punc(self, text, never_split=None):
        """
        Split the text on punctuation characters.
        Args:
            text (String): the text to split
            never_split (List): words that must never be split, defaults to empty
        Return:
            ["".join(x) for x in output] (List): the resulting tokens
        """
        if never_split is not None and text in never_split:
            return [text]
        chars = list(text)
        i = 0
        start_new_word = True
        output = []
        while i < len(chars):
            char = chars[i]
            if self.is_punctuation(char):
                output.append([char])
                start_new_word = True
            else:
                if start_new_word:
                    output.append([])
                start_new_word = False
                output[-1].append(char)
            i += 1

        return ["".join(x) for x in output]
    @staticmethod
    def is_control(char):
        """
        Check whether a character is a control character.
        Arg:
            char: the character
        Return:
            bool: the check result
        """
        # These are technically control characters but we count them as whitespace
        # characters.
        if char == "\t" or char == "\n" or char == "\r":
            return False
        cat = unicodedata.category(char)
        if cat.startswith("C"):
            return True
        return False
    @staticmethod
    def is_whitespace(char):
        """
        Check whether a character is a whitespace character.
        Arg:
            char: the character
        Return:
            bool: the check result
        """
        # \t, \n, and \r are technically control characters but we treat them
        # as whitespace since they are generally considered as such.
        if char == " " or char == "\t" or char == "\n" or char == "\r":
            return True
        cat = unicodedata.category(char)
        if cat == "Zs":
            return True
        return False
    @staticmethod
    def is_chinese_char(cp):
        """
        Check whether a code point belongs to a Chinese (CJK) character.
        Arg:
            cp (int): the character's Unicode code point
        Return:
            bool: the check result
        """
        # This defines a "chinese character" as anything in the CJK Unicode block:
        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
        #
        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
        # despite its name. The modern Korean Hangul alphabet is a different block,
        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
        # space-separated words, so they are not treated specially and are handled
        # like all of the other languages.
        if ((cp >= 0x4E00 and cp <= 0x9FFF) or
                (cp >= 0x3400 and cp <= 0x4DBF) or
                (cp >= 0x20000 and cp <= 0x2A6DF) or
                (cp >= 0x2A700 and cp <= 0x2B73F) or
                (cp >= 0x2B740 and cp <= 0x2B81F) or
                (cp >= 0x2B820 and cp <= 0x2CEAF) or
                (cp >= 0xF900 and cp <= 0xFAFF) or
                (cp >= 0x2F800 and cp <= 0x2FA1F)):
            return True
        return False
    @staticmethod
    def is_punctuation(char):
        """
        Check whether a character is a punctuation character.
        Arg:
            char: the character
        Return:
            bool: the check result
        """
        cp = ord(char)
        # We treat all non-letter/number ASCII as punctuation.
        # Characters such as "^", "$", and "`" are not in the Unicode
        # Punctuation class but we treat them as punctuation anyways, for
        # consistency.
        if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
                (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
            return True
        cat = unicodedata.category(char)
        if cat.startswith("P"):
            return True
        return False
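A minimal usage sketch follows. It is not part of the module itself: the sample sentence and the never_split entry are illustrative assumptions, and only the Serializer API shown above is used.

# Usage sketch (illustrative; sample text and never_split entry are assumptions).
from deepke.relation_extraction.standard.tools.serializer import Serializer

basic = Serializer(do_lower_case=True, do_chinese_split=False)
# CJK characters become single-character tokens; ASCII text is lower-cased
# and split on whitespace and punctuation.
print(basic.serialize("DeepKE supports 中文关系抽取!"))

word_level = Serializer(do_chinese_split=True, never_split=["关系抽取"])
# With do_chinese_split=True, jieba segments Chinese into words and
# suggest_freq keeps never_split entries intact.
print(word_level.serialize("DeepKE supports 中文关系抽取!"))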