# Small Tkinter tool for auditioning, transcribing (IBM Watson) and labelling
# wav clips into filelist.txt / filelist_arpa.txt training lists.
import tkinter as tk
import vlc
from os import listdir, remove
from os.path import isfile, join
import json
import shutil

# Set up IBM speech to text
from ibm_watson import SpeechToTextV1
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator

authenticator = IAMAuthenticator('b373X-km7u5pAaz2JoizXigcVFZFEB8CIntgYgWzbCQ4')
speech_to_text = SpeechToTextV1(authenticator=authenticator)
speech_to_text.set_service_url('https://api.us-south.speech-to-text.watson.cloud.ibm.com/instances/0e7a3edf-309c-4e64-b345-8251781245e4')
speech_to_text.set_disable_ssl_verification(True)


# Set up Arpabet dict: convert plain text to ARPAbet notation using the
# pronunciation dictionary loaded into thisdict below.
def ARPA(text):
    if len(text) == 0:
        return "␤"
    out = ''
    for word_ in text.split(" "):
        word = word_
        end_chars = ''
        # Strip trailing punctuation so the bare word can be looked up,
        # then re-attach it after the ARPAbet substitution.
        while any(elem in word for elem in r"!?,.;") and len(word) > 1:
            if word[-1] == '!':
                end_chars = '!' + end_chars
                word = word[:-1]
            elif word[-1] == '?':
                end_chars = '?' + end_chars
                word = word[:-1]
            elif word[-1] == ',':
                end_chars = ',' + end_chars
                word = word[:-1]
            elif word[-1] == '.':
                end_chars = '.' + end_chars
                word = word[:-1]
            elif word[-1] == ';':
                end_chars = ';' + end_chars
                word = word[:-1]
            else:
                break
        try:
            word_arpa = thisdict[word.upper()]
        except KeyError:
            word_arpa = ''
        if len(word_arpa) != 0:
            word = "{" + str(word_arpa) + "}"
        out = (out + " " + word + end_chars).strip()
    if out[-1] != "␤":
        out = out + "␤"
    return out


thisdict = {}
# And load it
with open('merged.dict_1.1.txt', "r") as dict_file:
    for line in reversed(dict_file.read().splitlines()):
        thisdict[(line.split(" ", 1))[0]] = (line.split(" ", 1))[1].strip()

# Stuff for dataset path
IDX = 0
MYPATH = "./dataset/in"
OUTPATH = './dataset/'

# f = open("{}filelist.txt".format(OUTPATH), "w")
# f.write("")
# f.close()
# f = open("{}filelist_arpa.txt".format(OUTPATH), "w")
# f.write("")
# f.close()

onlyfiles = [f for f in listdir(MYPATH) if isfile(join(MYPATH, f))]
if len(onlyfiles) > 0:
    img = onlyfiles[0]


def keyEvent(event):
    # Cycle through the remaining files with the Left/Right arrow keys.
    global IDX
    if event.keysym == 'Left':
        IDX = IDX - 1
        if IDX < 0:
            IDX = len(onlyfiles) - 1
        w.config(text=onlyfiles[IDX])
        v.set("")
    if event.keysym == 'Right':
        IDX = IDX + 1
        if IDX >= len(onlyfiles):
            IDX = 0
        w.config(text=onlyfiles[IDX])
        v.set("")


def comp_s(event=None):
    # Print the current transcript and clear the entry box.
    print(v.get())
    v.set("")


player = None  # keep a module-level reference so playback is not cut short by garbage collection


def playTrack(event=None):
    global player
    player = vlc.MediaPlayer("{}/{}".format(MYPATH, onlyfiles[IDX]))
    player.play()


def transcribeTrack(event=None):
    # Send the current wav to Watson and drop the top transcript into the entry box.
    with open("{}/{}".format(MYPATH, onlyfiles[IDX]), 'rb') as audio_file:
        speech_recognition_results = speech_to_text.recognize(
            audio=audio_file,
            content_type='audio/wav',
            word_alternatives_threshold=0.9,
        ).get_result()
    # print(json.dumps(speech_recognition_results, indent=2))
    v.set(speech_recognition_results["results"][0]["alternatives"][0]["transcript"])


def saveTrack(event=None):
    # Append the plain-text and ARPAbet transcript lines, then move the wav to the output folder.
    with open("{}filelist.txt".format(OUTPATH), "a") as f:
        f.write("wavs/out/{}|{}\n".format(onlyfiles[IDX], o1.cget('text')))
    with open("{}filelist_arpa.txt".format(OUTPATH), "a") as f:
        f.write("wavs/out/{}|{}\n".format(onlyfiles[IDX], o2.cget('text')))
    move_audio_file(IDX, 'out')


def discardTrack(event=None):
    move_audio_file(IDX, 'discard')


def callback(sv):
    # Mirror the entry text into the plain and ARPAbet preview labels as it is typed.
    o1.config(text="{}␤".format(sv.get().strip()), wraplength=500)
    o2.config(text=ARPA(sv.get().strip()), wraplength=500)


def move_audio_file(idx, destination):
    # Copy the file into the destination folder, delete the original,
    # and advance the display to the next remaining file.
    global IDX
    file_path = onlyfiles.pop(idx)
    shutil.copyfile(MYPATH + '/' + file_path, OUTPATH + destination + '/' + file_path)
    remove(MYPATH + '/' + file_path)
    if len(onlyfiles) == 0:
        w.config(text="No files remaining")
    else:
        if IDX >= len(onlyfiles):
            IDX = 0
        w.config(text=onlyfiles[IDX])
    v.set("")


mw = tk.Tk()
mw.title('Training Data - {} remaining'.format(len(onlyfiles)))
mw.geometry('500x200')
mw.configure(bg='black')

toolbar = tk.Frame(master=mw, width='500', height='24', borderwidth=2, bg='slategray4', relief='raised')
toolbar.pack()
play_btn = tk.Button(toolbar, text="(C-p) Play track", command=playTrack)
play_btn.pack(side='left')
save_btn = tk.Button(toolbar, text="(C-s) Save track", command=saveTrack)
save_btn.pack(side='left')
transcribe_btn = tk.Button(toolbar, text="(C-t) Transcribe track", command=transcribeTrack)
transcribe_btn.pack(side='left')
discard_btn = tk.Button(toolbar, text="(C-d) Discard track", command=discardTrack)
discard_btn.pack(side='left')

back = tk.Frame(master=mw, bg='black')
# The event strings were lost from the source; reconstructed from the "(C-p)" etc.
# button labels and the arrow-key handling in keyEvent.
back.bind('<Key>', keyEvent)
back.bind("<Control-p>", playTrack)
back.bind("<Control-t>", transcribeTrack)
back.bind("<Control-s>", saveTrack)
back.bind("<Control-d>", discardTrack)
back.pack_propagate(0)
back.pack(fill=tk.BOTH, expand=1)
back.focus_set()

# canvas = tk.Canvas(master=back, width='1280', height='720', bg='black', highlightthickness=0)
# canvas.pack()

w = tk.Label(master=back, text=onlyfiles[IDX], bg='black', fg='white')
w.pack()

v = tk.StringVar()
v.trace("w", lambda name, index, mode, sv=v: callback(v))
e = tk.Entry(master=back, textvariable=v, width='500', bg='black', fg='white', highlightbackground='grey')
e.pack()
# Same shortcuts while the entry box has focus; the comp_s binding is assumed to be <Return>.
e.bind('<Return>', comp_s)
e.bind("<Control-p>", playTrack)
e.bind("<Control-t>", transcribeTrack)
e.bind("<Control-s>", saveTrack)
e.bind("<Control-d>", discardTrack)

o1 = tk.Label(master=back, text="text here", bg='black', fg='white')
o1.pack()
o2 = tk.Label(master=back, text="arpabet here", bg='black', fg='white')
o2.pack()

mw.mainloop()