# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re

chinese_char_pattern = re.compile(r'[\u4e00-\u9fff]+')


# whether the text contains any Chinese character
def contains_chinese(text):
    return bool(chinese_char_pattern.search(text))


# replace superscript (corner mark) symbols with their Chinese reading
def replace_corner_mark(text):
    text = text.replace('²', '平方')
    text = text.replace('³', '立方')
    return text


# remove meaningless symbols
def remove_bracket(text):
    text = text.replace('(', '').replace(')', '')
    text = text.replace('【', '').replace('】', '')
    text = text.replace('`', '')
    text = text.replace("——", " ")
    return text


# spell out Arabic numerals as English words
def spell_out_number(text: str, inflect_parser):
    new_text = []
    st = None
    for i, c in enumerate(text):
        if not c.isdigit():
            if st is not None:
                # flush the digit run collected so far
                num_str = inflect_parser.number_to_words(text[st: i])
                new_text.append(num_str)
                st = None
            new_text.append(c)
        else:
            if st is None:
                st = i
    # handle a digit run that reaches the end of the text
    if st is not None and st < len(text):
        num_str = inflect_parser.number_to_words(text[st:])
        new_text.append(num_str)
    return ''.join(new_text)
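

# Usage sketch for spell_out_number, assuming the third-party `inflect`
# package supplies the parser:
#     import inflect
#     parser = inflect.engine()
#     spell_out_number("room 42 on floor 3", parser)
#     # -> 'room forty-two on floor three'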
# split paragraph logic:
# 1. per-sentence max length token_max_n, min length token_min_n; merge the last
#    sentence into the previous one if its length is less than merge_len
# 2. calculate sentence length according to lang
# 3. split sentences on punctuation
# (see the usage example in the __main__ block at the end of this file)
def split_paragraph(text: str, tokenize, lang="zh", token_max_n=80, token_min_n=60, merge_len=20, comma_split=False):
    def calc_utt_length(_text: str):
        if lang == "zh":
            return len(_text)
        else:
            return len(tokenize(_text))

    def should_merge(_text: str):
        if lang == "zh":
            return len(_text) < merge_len
        else:
            return len(tokenize(_text)) < merge_len

    if lang == "zh":
        pounc = ['。', '？', '！', '；', '：', '、', '.', '?', '!', ';']
    else:
        pounc = ['.', '?', '!', ';', ':']
    if comma_split:
        pounc.extend(['，', ','])

    # make sure the text ends with a sentence-final punctuation mark
    if text[-1] not in pounc:
        if lang == "zh":
            text += "。"
        else:
            text += "."

    # cut the text into sentences at punctuation, keeping a trailing quote with its sentence
    st = 0
    utts = []
    for i, c in enumerate(text):
        if c in pounc:
            if len(text[st: i]) > 0:
                utts.append(text[st: i] + c)
            if i + 1 < len(text) and text[i + 1] in ['"', '”']:
                tmp = utts.pop(-1)
                utts.append(tmp + text[i + 1])
                st = i + 2
            else:
                st = i + 1

    # greedily pack sentences into utterances bounded by token_max_n / token_min_n
    final_utts = []
    cur_utt = ""
    for utt in utts:
        if calc_utt_length(cur_utt + utt) > token_max_n and calc_utt_length(cur_utt) > token_min_n:
            final_utts.append(cur_utt)
            cur_utt = ""
        cur_utt = cur_utt + utt
    if len(cur_utt) > 0:
        if should_merge(cur_utt) and len(final_utts) != 0:
            final_utts[-1] = final_utts[-1] + cur_utt
        else:
            final_utts.append(cur_utt)

    return final_utts
# remove whitespace between Chinese characters, keep it between ASCII words
def replace_blank(text: str):
    out_str = []
    for i, c in enumerate(text):
        if c == " ":
            # keep a space only when both neighbours are non-space ASCII characters
            if (0 < i < len(text) - 1 and
                    text[i + 1].isascii() and text[i + 1] != " " and
                    text[i - 1].isascii() and text[i - 1] != " "):
                out_str.append(c)
        else:
            out_str.append(c)
    return "".join(out_str)