--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,19 @@
Copyright (c) 2017 Keith Ito

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
--- /dev/null
+++ b/text/__init__.py
@@ -0,0 +1,76 @@
""" from https://github.com/keithito/tacotron """ | |||
import re | |||
from text import cleaners | |||
from text.symbols import symbols | |||
# Mappings from symbol to numeric ID and vice versa: | |||
_symbol_to_id = {s: i for i, s in enumerate(symbols)} | |||
_id_to_symbol = {i: s for i, s in enumerate(symbols)} | |||
# Regular expression matching text enclosed in curly braces: | |||
_curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)') | |||
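# For example (illustrative), _curly_re.match('foo {AA R} bar').groups()
# yields ('foo ', 'AA R', ' bar'): leading text, ARPAbet chunk, remainder.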

def text_to_sequence(text, cleaner_names):
    '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.

    The text can optionally have ARPAbet sequences enclosed in curly braces embedded
    in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street."

    Args:
      text: string to convert to a sequence
      cleaner_names: names of the cleaner functions to run the text through

    Returns:
      List of integers corresponding to the symbols in the text
    '''
    sequence = []

    # Check for curly braces and treat their contents as ARPAbet:
    while len(text):
        m = _curly_re.match(text)
        if not m:
            sequence += _symbols_to_sequence(_clean_text(text, cleaner_names))
            break
        sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names))
        sequence += _arpabet_to_sequence(m.group(2))
        text = m.group(3)

    # Append EOS token
    sequence.append(_symbol_to_id['~'])
    return sequence
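
# Usage sketch (illustrative; the actual integer IDs depend on the symbol
# table defined in text/symbols.py):
#
#   ids = text_to_sequence('Turn left on {HH AW1 S T AH0 N} Street.',
#                          ['english_cleaners'])
#
# Plain text runs through the cleaners, the braced chunk is parsed as
# ARPAbet, and the EOS id for '~' is appended at the end.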

def sequence_to_text(sequence):
    '''Converts a sequence of IDs back to a string'''
    result = ''
    for symbol_id in sequence:
        if symbol_id in _id_to_symbol:
            s = _id_to_symbol[symbol_id]
            # Enclose ARPAbet back in curly braces:
            if len(s) > 1 and s[0] == '@':
                s = '{%s}' % s[1:]
            result += s
    return result.replace('}{', ' ')
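
# Round-trip note (illustrative): sequence_to_text(ids) for the example above
# returns 'turn left on {HH AW1 S T AH0 N} street.~', with consecutive
# ARPAbet symbols re-joined inside one pair of braces.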

def _clean_text(text, cleaner_names):
    for name in cleaner_names:
        # Default to None so an unknown name raises the error below instead
        # of an AttributeError out of getattr:
        cleaner = getattr(cleaners, name, None)
        if not cleaner:
            raise Exception('Unknown cleaner: %s' % name)
        text = cleaner(text)
    return text

def _symbols_to_sequence(symbols):
    return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)]


def _arpabet_to_sequence(text):
    return _symbols_to_sequence(['@' + s for s in text.split()])

def _should_keep_symbol(s):
    # Filter out padding and EOS; compare with '!=' rather than 'is not',
    # since identity comparison against string literals is unreliable:
    return s in _symbol_to_id and s != '_' and s != '~'
--- /dev/null
+++ b/text/cleaners.py
@@ -0,0 +1,90 @@
""" from https://github.com/keithito/tacotron """ | |||
''' | |||
Cleaners are transformations that run over the input text at both training and eval time. | |||
Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" | |||
hyperparameter. Some cleaners are English-specific. You'll typically want to use: | |||
1. "english_cleaners" for English text | |||
2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using | |||
the Unidecode library (https://pypi.python.org/pypi/Unidecode) | |||
3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update | |||
the symbols in symbols.py to match your data). | |||
''' | |||
import re | |||
from unidecode import unidecode | |||
from .numbers import normalize_numbers | |||
# Regular expression matching whitespace: | |||
_whitespace_re = re.compile(r'\s+') | |||
# List of (regular expression, replacement) pairs for abbreviations:
_abbreviations = [(re.compile(r'\b%s\.' % x[0], re.IGNORECASE), x[1]) for x in [
    ('mrs', 'misess'),
    ('mr', 'mister'),
    ('dr', 'doctor'),
    ('st', 'saint'),
    ('co', 'company'),
    ('jr', 'junior'),
    ('maj', 'major'),
    ('gen', 'general'),
    ('drs', 'doctors'),
    ('rev', 'reverend'),
    ('lt', 'lieutenant'),
    ('hon', 'honorable'),
    ('sgt', 'sergeant'),
    ('capt', 'captain'),
    ('esq', 'esquire'),
    ('ltd', 'limited'),
    ('col', 'colonel'),
    ('ft', 'fort'),
]]

def expand_abbreviations(text):
    for regex, replacement in _abbreviations:
        text = re.sub(regex, replacement, text)
    return text


def expand_numbers(text):
    return normalize_numbers(text)


def lowercase(text):
    return text.lower()


def collapse_whitespace(text):
    return re.sub(_whitespace_re, ' ', text)


def convert_to_ascii(text):
    return unidecode(text)

def basic_cleaners(text):
    '''Basic pipeline that lowercases and collapses whitespace without transliteration.'''
    text = lowercase(text)
    text = collapse_whitespace(text)
    return text


def transliteration_cleaners(text):
    '''Pipeline for non-English text that transliterates to ASCII.'''
    text = convert_to_ascii(text)
    text = lowercase(text)
    text = collapse_whitespace(text)
    return text


def english_cleaners(text):
    '''Pipeline for English text, including number and abbreviation expansion.'''
    text = convert_to_ascii(text)
    text = lowercase(text)
    text = expand_numbers(text)
    text = expand_abbreviations(text)
    text = collapse_whitespace(text)
    return text
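
# Quick demo of the full English pipeline (illustrative; run from the repo
# root as `python -m text.cleaners` so the relative import resolves):
if __name__ == '__main__':
    sample = 'Mr. Müller bought 3 apples for $2.50.'
    # convert_to_ascii -> lowercase -> expand_numbers -> expand_abbreviations
    # -> collapse_whitespace, giving roughly:
    # 'mister muller bought three apples for two dollars, fifty cents.'
    print(english_cleaners(sample))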
--- /dev/null
+++ b/text/cmudict.py
@@ -0,0 +1,65 @@
""" from https://github.com/keithito/tacotron """ | |||
import re | |||
valid_symbols = [ | |||
'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2', | |||
'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2', | |||
'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY', | |||
'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1', | |||
'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', | |||
'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', | |||
'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH' | |||
] | |||
_valid_symbol_set = set(valid_symbols) | |||

class CMUDict:
    '''Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict'''

    def __init__(self, file_or_path, keep_ambiguous=True):
        if isinstance(file_or_path, str):
            with open(file_or_path, encoding='latin-1') as f:
                entries = _parse_cmudict(f)
        else:
            entries = _parse_cmudict(file_or_path)
        if not keep_ambiguous:
            entries = {word: pron for word, pron in entries.items() if len(pron) == 1}
        self._entries = entries

    def __len__(self):
        return len(self._entries)

    def lookup(self, word):
        '''Returns list of ARPAbet pronunciations of the given word.'''
        return self._entries.get(word.upper())

_alt_re = re.compile(r'\([0-9]+\)')


def _parse_cmudict(file):
    cmudict = {}
    for line in file:
        if len(line) and ('A' <= line[0] <= 'Z' or line[0] == "'"):
            # CMUdict separates the word from its pronunciation with two
            # spaces; splitting on a single space would break every entry:
            parts = line.split('  ')
            word = re.sub(_alt_re, '', parts[0])
            pronunciation = _get_pronunciation(parts[1])
            if pronunciation:
                if word in cmudict:
                    cmudict[word].append(pronunciation)
                else:
                    cmudict[word] = [pronunciation]
    return cmudict


def _get_pronunciation(s):
    parts = s.strip().split(' ')
    for part in parts:
        if part not in _valid_symbol_set:
            return None
    return ' '.join(parts)
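
# Minimal self-test with two entries in CMUdict format (illustrative sample
# data; real pronunciations come from the cmudict file itself):
if __name__ == '__main__':
    import io
    sample = io.StringIO(
        'HOUSTON  HH Y UW1 S T AH0 N\n'
        'HOUSTON(1)  Y UW1 S T AH0 N\n'
    )
    d = CMUDict(sample)
    print(len(d))               # 1 word ...
    print(d.lookup('houston'))  # ... with 2 pronunciations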
--- /dev/null
+++ b/text/numbers.py
@@ -0,0 +1,71 @@
""" from https://github.com/keithito/tacotron """ | |||
import inflect | |||
import re | |||
_inflect = inflect.engine() | |||
_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') | |||
_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') | |||
_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') | |||
_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') | |||
_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') | |||
_number_re = re.compile(r'[0-9]+') | |||

def _remove_commas(m):
    return m.group(1).replace(',', '')


def _expand_decimal_point(m):
    return m.group(1).replace('.', ' point ')


def _expand_dollars(m):
    match = m.group(1)
    parts = match.split('.')
    if len(parts) > 2:
        return match + ' dollars'  # Unexpected format
    dollars = int(parts[0]) if parts[0] else 0
    cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
    if dollars and cents:
        dollar_unit = 'dollar' if dollars == 1 else 'dollars'
        cent_unit = 'cent' if cents == 1 else 'cents'
        return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit)
    elif dollars:
        dollar_unit = 'dollar' if dollars == 1 else 'dollars'
        return '%s %s' % (dollars, dollar_unit)
    elif cents:
        cent_unit = 'cent' if cents == 1 else 'cents'
        return '%s %s' % (cents, cent_unit)
    else:
        return 'zero dollars'

def _expand_ordinal(m):
    return _inflect.number_to_words(m.group(0))


def _expand_number(m):
    num = int(m.group(0))
    # Numbers between 1000 and 3000 are usually years, so read them in pairs
    # of digits ("1945" -> "nineteen forty-five"), with special cases around
    # 2000:
    if 1000 < num < 3000:
        if num == 2000:
            return 'two thousand'
        elif 2000 < num < 2010:
            return 'two thousand ' + _inflect.number_to_words(num % 100)
        elif num % 100 == 0:
            return _inflect.number_to_words(num // 100) + ' hundred'
        else:
            return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ')
    else:
        return _inflect.number_to_words(num, andword='')

def normalize_numbers(text):
    # Order matters: strip commas first, expand currency before bare
    # decimals, and leave the generic number rule for last:
    text = re.sub(_comma_number_re, _remove_commas, text)
    text = re.sub(_pounds_re, r'\1 pounds', text)
    text = re.sub(_dollars_re, _expand_dollars, text)
    text = re.sub(_decimal_number_re, _expand_decimal_point, text)
    text = re.sub(_ordinal_re, _expand_ordinal, text)
    text = re.sub(_number_re, _expand_number, text)
    return text
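
# A few worked examples (illustrative; runnable as `python text/numbers.py`
# with the inflect package installed):
if __name__ == '__main__':
    for s in ['$2.50', '1945', '2008', '3rd', '1,234 items']:
        print('%-12s -> %s' % (s, normalize_numbers(s)))
    # Expected, roughly: 'two dollars, fifty cents', 'nineteen forty-five',
    # 'two thousand eight', 'third', and 'twelve thirty-four items' (the
    # year-style heuristic also applies to '1,234' once commas are stripped).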
--- /dev/null
+++ b/text/symbols.py
@@ -0,0 +1,17 @@
""" from https://github.com/keithito/tacotron """ | |||
''' | |||
Defines the set of symbols used in text input to the model. | |||
The default is a set of ASCII characters that works well for English or text that has been run through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. ''' | |||
from text import cmudict | |||
_pad = '_' | |||
_eos = '~' | |||
_characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? ' | |||
# Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): | |||
_arpabet = ['@' + s for s in cmudict.valid_symbols] | |||
# Export all symbols: | |||
symbols = [_pad, _eos] + list(_characters) + _arpabet |
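
# Sanity check (illustrative; run from the repo root as `python -m text.symbols`):
if __name__ == '__main__':
    # 2 specials + 63 ASCII characters + 84 ARPAbet symbols = 149 entries
    print(len(symbols))
    print(symbols[:5])  # ['_', '~', 'A', 'B', 'C']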