Rudimentary Tkinter app to speed up tagging and transcribing training data for Tacotron.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 

185 lines
5.2 KiB

import tkinter as tk
import vlc
from os import listdir, remove
from os.path import isfile, join
import json
import shutil
# Set up IBM speech to text
import os

from ibm_watson import SpeechToTextV1
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator

# The API key and service URL can be supplied through the environment
# (IBM_STT_APIKEY / IBM_STT_URL) so they do not have to live in the source.
# The previously hard-coded values remain as fallbacks so existing setups
# keep working unchanged.
# NOTE(review): a key committed to source control should be treated as
# leaked - rotate it and drop the fallback once the variables are in use.
IBM_STT_APIKEY = (
    os.environ.get('IBM_STT_APIKEY')
    or 'b373X-km7u5pAaz2JoizXigcVFZFEB8CIntgYgWzbCQ4'
)
IBM_STT_URL = (
    os.environ.get('IBM_STT_URL')
    or 'https://api.us-south.speech-to-text.watson.cloud.ibm.com/instances/0e7a3edf-309c-4e64-b345-8251781245e4'
)

authenticator = IAMAuthenticator(IBM_STT_APIKEY)
speech_to_text = SpeechToTextV1(
    authenticator=authenticator
)
speech_to_text.set_service_url(IBM_STT_URL)
# NOTE(review): this switches off TLS certificate verification for every
# request to the service. Kept because the original setup did so; remove it
# unless something in the network path really requires it.
speech_to_text.set_disable_ssl_verification(True)
# Set up Arpabet dict
def ARPA(text, dictionary=None):
    """Replace every word of *text* that has a known pronunciation by ARPAbet.

    The text is split on single spaces. Trailing punctuation (``!?,.;``) is
    peeled off each word, the remaining word is looked up upper-cased in the
    pronunciation dictionary, and a hit is replaced by the pronunciation
    wrapped in curly braces. Words without an entry are passed through
    unchanged. The peeled punctuation is re-attached after the word.

    Args:
        text: Sentence to convert.
        dictionary: Optional mapping from upper-case word to pronunciation
            string. Defaults to the module-level ``thisdict``.

    Returns:
        The converted sentence, or ``""`` when there is nothing to convert.
    """
    if len(text) == 0:
        return ""
    if dictionary is None:
        dictionary = thisdict
    punctuation = "!?,.;"
    out = ''
    for word in text.split(" "):
        end_chars = ''
        # Peel off *all* trailing punctuation, but never the last remaining
        # character, so a token such as "?!" cannot be emptied (indexing an
        # empty word is what used to raise IndexError).
        while len(word) > 1 and word[-1] in punctuation:
            end_chars = word[-1] + end_chars
            word = word[:-1]
        word_arpa = dictionary.get(word.upper(), '')
        if len(word_arpa) != 0:
            word = "{" + str(word_arpa) + "}"
        # strip() drops the leading space of the first word and collapses
        # the empty tokens produced by consecutive spaces.
        out = (out + " " + word + end_chars).strip()
    # NOTE(review): the end-of-sentence marker is an empty string in this
    # file, so nothing is appended here; callback() appends a visible marker
    # to the plain-text label - confirm whether one was meant here as well.
    end_marker = ""
    if out and not out.endswith(end_marker):
        out = out + end_marker
    return out
def _load_pronunciations(path):
    """Read a pronunciation dictionary file into a ``{word: pronunciation}`` dict.

    Each usable line has the form ``<word> <pronunciation>``, split at the
    first space. Lines are applied last-to-first so that, when a word is
    listed more than once, the entry nearest the top of the file wins.
    Lines without a space (blank or malformed) are skipped instead of
    raising IndexError.
    """
    with open(path, "r") as dict_file:
        dict_lines = dict_file.read().splitlines()
    entries = {}
    for line in reversed(dict_lines):
        word, separator, pronunciation = line.partition(" ")
        if not separator:
            continue
        entries[word] = pronunciation.strip()
    return entries


thisdict = _load_pronunciations('merged.dict_1.1.txt')  # And load it
# Stuff for dataset path
IDX = 0  # position in onlyfiles of the file currently selected
MYPATH = "./dataset/in"  # folder that is scanned for input files
OUTPATH = './dataset/'  # prefix for the file lists and destination folders
# Uncomment to empty both file lists at start-up:
# f = open("{}filelist.txt".format(OUTPATH), "w")
# f.write("")
# f.close()
# f = open("{}filelist_arpa.txt".format(OUTPATH), "w")
# f.write("")
# f.close()
# Keep only regular files that sit directly inside MYPATH.
onlyfiles = list(
    filter(lambda name: isfile(join(MYPATH, name)), listdir(MYPATH))
)
if onlyfiles:
    img = onlyfiles[0]
def keyEvent(event):
    """Select the previous/next file on the Left/Right arrow keys.

    The selection wraps around at both ends of ``onlyfiles``. The file-name
    label is updated and the text entry is cleared. Any other key is
    ignored, and so is every key once ``onlyfiles`` is empty.

    Args:
        event: Tk key event; only ``event.keysym`` is read.
    """
    global IDX
    if event.keysym == 'Left':
        step = -1
    elif event.keysym == 'Right':
        step = 1
    else:
        return
    if not onlyfiles:
        # Nothing left to select; indexing the empty list would raise.
        return
    # Modulo gives the same wrap-around as the explicit bounds checks and
    # also pulls a stale, out-of-range IDX back into the list.
    IDX = (IDX + step) % len(onlyfiles)
    w.config(text=onlyfiles[IDX])
    v.set("")
def comp_s(event):
    """Print the text currently held by ``v`` and then clear it.

    Args:
        event: Tk event that triggered the handler (unused).
    """
    typed = v.get()
    print(typed)
    v.set("")
def playTrack(event):
    """Start VLC playback of the currently selected file.

    Args:
        event: Tk event that triggered the handler (unused).
    """
    track_path = MYPATH + "/" + onlyfiles[IDX]
    player = vlc.MediaPlayer(track_path)
    player.play()
def transcribeTrack(event):
    """Transcribe the selected file with Watson and put the text in the entry.

    The file is uploaded as ``audio/wav``. The top alternative of every
    result segment is taken, stripped of surrounding whitespace and joined
    with single spaces, so a recording that the service splits into several
    segments is not cut off after the first one. When the service returns no
    transcript at all, the entry is left untouched and a note is printed
    instead of raising IndexError.

    Args:
        event: Tk event that triggered the handler (unused).
    """
    track_path = "{}/{}".format(MYPATH, onlyfiles[IDX])
    with open(track_path, 'rb') as audio_file:
        speech_recognition_results = speech_to_text.recognize(
            audio=audio_file,
            content_type='audio/wav',
            word_alternatives_threshold=0.9,
        ).get_result()
    # print(json.dumps(speech_recognition_results, indent=2))
    transcripts = [
        result["alternatives"][0]["transcript"].strip()
        for result in speech_recognition_results.get("results", [])
        if result.get("alternatives")
    ]
    if not transcripts:
        print("No transcript returned for {}".format(track_path))
        return
    v.set(" ".join(transcripts))
def saveTrack(event):
    """Log the selected file with its transcripts and move it to 'out'.

    One line is appended to ``filelist.txt`` with the text shown in ``o1``
    and one line to ``filelist_arpa.txt`` with the text shown in ``o2``.
    The file is then handed to ``move_audio_file``. Nothing happens when
    ``IDX`` does not point at an entry of ``onlyfiles`` (for example when
    the list is empty).

    Args:
        event: Tk event that triggered the handler (unused).
    """
    if not 0 <= IDX < len(onlyfiles):
        return
    file_name = onlyfiles[IDX]
    # "with" closes each list even if the write itself raises.
    with open("{}filelist.txt".format(OUTPATH), "a") as filelist:
        filelist.write("wavs/out/{}|{}\n".format(file_name, o1.cget('text')))
    with open("{}filelist_arpa.txt".format(OUTPATH), "a") as filelist_arpa:
        filelist_arpa.write("wavs/out/{}|{}\n".format(file_name, o2.cget('text')))
    move_audio_file(IDX, 'out')
def discardTrack(event):
    """Move the currently selected file to the 'discard' destination.

    Unlike saveTrack, nothing is written to the file lists.

    Args:
        event: Tk event that triggered the handler (unused).
    """
    move_audio_file(idx=IDX, destination='discard')
def callback(sv):
    """Refresh both preview labels from the text held by *sv*.

    ``o1`` shows the stripped text followed by the marker character and
    ``o2`` shows the ARPA() conversion of the stripped text; both labels
    wrap at 500.

    Args:
        sv: Object whose ``get()`` returns the current entry text.
    """
    typed = sv.get().strip()
    plain_preview = "{}␤".format(typed)
    arpa_preview = ARPA(typed)
    o1.config(text=plain_preview, wraplength=500)
    o2.config(text=arpa_preview, wraplength=500)
def move_audio_file(idx, destination):
    """Move ``onlyfiles[idx]`` out of the input folder and reselect a file.

    The file is copied from ``MYPATH`` to ``OUTPATH + destination`` and the
    source is deleted. Its entry is then removed from ``onlyfiles``. The
    entry before it (wrapping to the last one) becomes the selection: the
    label shows it and the global ``IDX`` now points at it, so the other
    handlers act on the file that is displayed. The text entry is cleared.
    When no files remain, the label is blanked and ``IDX`` is reset to 0.

    Args:
        idx: Index into ``onlyfiles`` of the file to move.
        destination: Name of the folder under ``OUTPATH`` to move it to.
    """
    global IDX
    file_path = onlyfiles[idx]
    # Touch the disk before the list: if the copy fails, the file stays
    # listed instead of vanishing from the UI while still on disk.
    shutil.copyfile(MYPATH + '/' + file_path, OUTPATH + destination + '/' + file_path)
    remove(MYPATH + '/' + file_path)
    onlyfiles.pop(idx)
    if not onlyfiles:
        IDX = 0
        w.config(text="")
        v.set("")
        return
    idx = idx - 1
    if idx < 0:
        idx = len(onlyfiles) - 1
    # Previously only the local idx changed, leaving the label and the
    # global selection pointing at different files.
    IDX = idx
    w.config(text=onlyfiles[idx])
    v.set("")
# Build the main window. The layout is a toolbar of action buttons on top
# and, below it, a frame holding the file-name label, the text entry and
# the two preview labels.
mw = tk.Tk()
mw.title('Training Data - {} remaining'.format(len(onlyfiles)))
mw.geometry('500x200')
mw.configure(bg='black')

toolbar = tk.Frame(master=mw, width='500', height='24', borderwidth=2, bg='slategray4', relief='raised')
toolbar.pack()
# The buttons used to have command=None and did nothing when clicked.
# Button commands are called without arguments, and the handlers ignore
# their event parameter, so None is passed in its place.
play_btn = tk.Button(toolbar, text="(C-p) Play track", command=lambda: playTrack(None))
play_btn.pack(side='left')
save_btn = tk.Button(toolbar, text="(C-s) Save track", command=lambda: saveTrack(None))
save_btn.pack(side='left')
transcribe_btn = tk.Button(toolbar, text="(C-t) Transcribe track", command=lambda: transcribeTrack(None))
transcribe_btn.pack(side='left')
discard_btn = tk.Button(toolbar, text="(C-d) Discard track", command=lambda: discardTrack(None))
discard_btn.pack(side='left')

back = tk.Frame(master=mw, bg='black')
back.bind('<Key>', keyEvent)
back.bind("<Control-p>", playTrack)
back.bind("<Control-t>", transcribeTrack)
back.bind("<Control-s>", saveTrack)
back.bind("<Control-d>", discardTrack)
back.pack_propagate(0)
back.pack(fill=tk.BOTH, expand=1)
back.focus_set()
# canvas = tk.Canvas(master=back, width='1280', height='720', bg='black', highlightthickness=0)
# canvas.pack()

# An empty input folder used to raise IndexError here before the window
# was ever shown; start with a blank label instead.
w = tk.Label(master=back, text=onlyfiles[IDX] if onlyfiles else "", bg='black', fg='white')
w.pack()

# Every write to v refreshes the two preview labels through callback().
v = tk.StringVar()
v.trace("w", lambda name, index, mode, sv=v: callback(sv))
e = tk.Entry(master=back, textvariable=v, width='500', bg='black', fg='white', highlightbackground='grey')
e.pack()
# The shortcuts are bound on the entry as well as on the frame, so they
# work whichever of the two has keyboard focus.
e.bind('<Return>', comp_s)
e.bind("<Control-p>", playTrack)
e.bind("<Control-t>", transcribeTrack)
e.bind("<Control-s>", saveTrack)
e.bind("<Control-d>", discardTrack)

o1 = tk.Label(master=back, text="text here", bg='black', fg='white')
o1.pack()
o2 = tk.Label(master=back, text="arpabet here", bg='black', fg='white')
o2.pack()

mw.mainloop()