commit 433297720896628cadd48415f8d2d919d9147018
Author: Daniel Muckerman
Date:   Fri Jul 3 16:40:28 2020 -0400

    Initial commit

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..6c03a27
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+dataset/
+.DS_Store
+env/
+*.tar
+merged.dict*
diff --git a/download_dict.py b/download_dict.py
new file mode 100644
index 0000000..91347ad
--- /dev/null
+++ b/download_dict.py
@@ -0,0 +1,5 @@
+from mega import Mega
+
+m = Mega.from_ephemeral()
+print("Downloading Dictionary...")
+m.download_from_url('https://mega.nz/#!yAMyFYCI!o_UmixbiIzosyYk-6O5xRZZDGpFRik_eMrZum-iQuhQ')
diff --git a/download_dict.sh b/download_dict.sh
new file mode 100755
index 0000000..03ed537
--- /dev/null
+++ b/download_dict.sh
@@ -0,0 +1,9 @@
+#!/bin/sh
+
+git clone https://github.com/jeroenmeulenaar/python3-mega
+cd python3-mega
+pip install -r requirements.txt
+python setup.py install
+
+cd ..
+python download_dict.py
\ No newline at end of file
diff --git a/get_data.sh b/get_data.sh
new file mode 100755
index 0000000..7001778
--- /dev/null
+++ b/get_data.sh
@@ -0,0 +1,10 @@
+#!/bin/sh
+
+wget -O dataset.tar --content-disposition --user-agent "macintosh" https://cloud.technicalincompetence.club/index.php/s/W8rLGrCKgrFXw8z/download
+tar -xvf dataset.tar
+mkdir -p dataset/in
+mv wavs/out/* dataset/in/
+rm -rf wavs
+
+mkdir -p dataset/out
+mkdir -p dataset/discard
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..3968277
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+ibm-watson==4.5.0
+python-mega==0.1.0
+python-vlc==3.0.10114
\ No newline at end of file
diff --git a/window.py b/window.py
new file mode 100644
index 0000000..16a1347
--- /dev/null
+++ b/window.py
@@ -0,0 +1,233 @@
+import tkinter as tk
+import vlc
+from os import environ, listdir, remove
+from os.path import isfile, join
+import json
+import shutil
+
+# Set up IBM speech to text
+from ibm_watson import SpeechToTextV1
+from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
+
+# NOTE(review): the API key below is committed to the repository and should
+# be rotated. SPEECH_TO_TEXT_APIKEY / SPEECH_TO_TEXT_URL override the
+# committed values so the literals can later be removed.
+authenticator = IAMAuthenticator(environ.get('SPEECH_TO_TEXT_APIKEY', 'b373X-km7u5pAaz2JoizXigcVFZFEB8CIntgYgWzbCQ4'))
+speech_to_text = SpeechToTextV1(
+    authenticator=authenticator
+)
+
+speech_to_text.set_service_url(environ.get('SPEECH_TO_TEXT_URL', 'https://api.us-south.speech-to-text.watson.cloud.ibm.com/instances/0e7a3edf-309c-4e64-b345-8251781245e4'))
+# NOTE(review): TLS certificate verification is disabled here; confirm this
+# is really needed before using the tool outside local testing.
+speech_to_text.set_disable_ssl_verification(True)
+
+# Set up Arpabet dict
+def ARPA(text):
+    """Convert text to ARPAbet, leaving unknown words as typed.
+
+    Each space-separated word is upper-cased and looked up in ``thisdict``.
+    Known words become ``{PRONUNCIATION}``; trailing ``!?,.;`` characters
+    are re-attached after the word. The result always ends in ``␤``.
+    """
+    if len(text) == 0:
+        return "␤"
+    out = ''
+    for word_ in text.split(" "):
+        word = word_
+        end_chars = ''
+        # Peel trailing punctuation one character at a time. Stopping at a
+        # single character keeps a bare "!" intact and never indexes an
+        # empty string on punctuation-only tokens such as "?!".
+        while len(word) > 1 and word[-1] in "!?,.;":
+            end_chars = word[-1] + end_chars
+            word = word[:-1]
+        word_arpa = thisdict.get(word.upper(), '')
+        if len(word_arpa) != 0:
+            word = "{" + str(word_arpa) + "}"
+        out = (out + " " + word + end_chars).strip()
+    # Whitespace-only input leaves ``out`` empty, so do not index it.
+    if not out.endswith("␤"):
+        out = out + "␤"
+    return out
+
+thisdict = {} # And load it
+with open('merged.dict_1.1.txt', "r") as dict_file:
+    # Walk the file backwards so that, for a word listed more than once,
+    # the entry nearest the top of the file is the one that is kept.
+    for line in reversed(dict_file.read().splitlines()):
+        entry, sep, pronunciation = line.partition(" ")
+        if not sep:
+            continue  # blank or malformed line: nothing to store
+        thisdict[entry] = pronunciation.strip()
+
+# Stuff for dataset path
+IDX = 0
+MYPATH = "./dataset/in"
+OUTPATH = './dataset/'
+NO_FILES_TEXT = '(no files remaining)'
+
+# f = open("{}filelist.txt".format(OUTPATH), "w")
+# f.write("")
+# f.close()
+
+# f = open("{}filelist_arpa.txt".format(OUTPATH), "w")
+# f.write("")
+# f.close()
+
+onlyfiles = [f for f in listdir(MYPATH) if isfile(join(MYPATH, f))]
+if len(onlyfiles) > 0:
+    img = onlyfiles[0]
+
+def current_file():
+    """Return the file name at IDX, or a placeholder once none are left."""
+    if onlyfiles:
+        return onlyfiles[IDX]
+    return NO_FILES_TEXT
+
+def keyEvent(event):
+    """Step to the previous/next file on Left/Right, wrapping at the ends."""
+    global IDX
+    if not onlyfiles:
+        return
+    if event.keysym == 'Left':
+        step = -1
+    elif event.keysym == 'Right':
+        step = 1
+    else:
+        return
+    IDX = (IDX + step) % len(onlyfiles)
+    w.config(text = onlyfiles[IDX])
+    v.set("")
+
+def comp_s(event):
+    """Print the current transcript and clear the entry box."""
+    print(v.get())
+    v.set("")
+
+def playTrack(event=None):
+    """Play the current file through VLC."""
+    if not onlyfiles:
+        return
+    player = vlc.MediaPlayer("{}/{}".format(MYPATH, onlyfiles[IDX]))
+    player.play()
+
+def transcribeTrack(event=None):
+    """Transcribe the current file with Watson and fill the entry box."""
+    if not onlyfiles:
+        return
+    with open("{}/{}".format(MYPATH, onlyfiles[IDX]),
+              'rb') as audio_file:
+        speech_recognition_results = speech_to_text.recognize(
+            audio=audio_file,
+            content_type='audio/wav',
+            word_alternatives_threshold=0.9,
+        ).get_result()
+    # print(json.dumps(speech_recognition_results, indent=2))
+    results = speech_recognition_results.get("results") or []
+    if not results or not results[0].get("alternatives"):
+        # Nothing was recognised: keep whatever is already typed.
+        print("No transcript returned for {}".format(onlyfiles[IDX]))
+        return
+    v.set(results[0]["alternatives"][0]["transcript"])
+
+def saveTrack(event=None):
+    """Append the current file to both file lists, then move it to out/."""
+    if not onlyfiles:
+        return
+    with open("{}filelist.txt".format(OUTPATH), "a") as f:
+        f.write("wavs/out/{}|{}\n".format(onlyfiles[IDX], o1.cget('text')))
+
+    with open("{}filelist_arpa.txt".format(OUTPATH), "a") as f:
+        f.write("wavs/out/{}|{}\n".format(onlyfiles[IDX], o2.cget('text')))
+
+    move_audio_file(IDX, 'out')
+
+def discardTrack(event=None):
+    """Move the current file to discard/ without recording a transcript."""
+    if not onlyfiles:
+        return
+    move_audio_file(IDX, 'discard')
+
+def callback(sv):
+    """Refresh the plain-text and ARPAbet preview labels from the entry."""
+    o1.config(text = "{}␤".format(sv.get().strip()), wraplength=500)
+    o2.config(text = ARPA(sv.get().strip()), wraplength=500)
+
+def move_audio_file(idx, destination):
+    """Move onlyfiles[idx] into the OUTPATH/<destination> folder.
+
+    Afterwards the previous file (wrapping to the end) becomes current:
+    the global IDX, the window title count and the file label are updated
+    and the entry box is cleared.
+    """
+    global IDX
+    file_path = onlyfiles[idx]
+    # Move on disk first so a failed copy leaves the in-memory list intact.
+    shutil.copyfile(MYPATH + '/' + file_path, OUTPATH + destination + '/' + file_path)
+    remove(MYPATH + '/' + file_path)
+    onlyfiles.pop(idx)
+
+    # Keep the global cursor in step with the label so later actions never
+    # index past the end of the shortened list.
+    IDX = idx - 1
+    if IDX < 0:
+        IDX = max(len(onlyfiles) - 1, 0)
+
+    mw.title('Training Data - {} remaining'.format(len(onlyfiles)))
+    w.config(text = current_file())
+    v.set("")
+
+
+mw = tk.Tk()
+
+mw.title('Training Data - {} remaining'.format(len(onlyfiles)))
+mw.geometry('500x200')
+mw.configure(bg='black')
+
+
+toolbar = tk.Frame(master=mw, width='500', height='24', borderwidth=2, bg='slategray4', relief='raised')
+toolbar.pack()
+play_btn = tk.Button(toolbar, text="(C-p) Play track", command=playTrack)
+play_btn.pack(side='left')
+save_btn = tk.Button(toolbar, text="(C-s) Save track", command=saveTrack)
+save_btn.pack(side='left')
+transcribe_btn = tk.Button(toolbar, text="(C-t) Transcribe track", command=transcribeTrack)
+transcribe_btn.pack(side='left')
+discard_btn = tk.Button(toolbar, text="(C-d) Discard track", command=discardTrack)
+discard_btn.pack(side='left')
+
+back = tk.Frame(master=mw, bg='black')
+# Sequences follow the shortcuts shown on the toolbar buttons; '<Key>' and
+# (on the entry) '<Return>' are assumed - TODO confirm.
+back.bind('<Key>', keyEvent)
+back.bind("<Control-p>", playTrack)
+back.bind("<Control-t>", transcribeTrack)
+back.bind("<Control-s>", saveTrack)
+back.bind("<Control-d>", discardTrack)
+back.pack_propagate(0)
+back.pack(fill=tk.BOTH, expand=1)
+back.focus_set()
+
+# canvas = tk.Canvas(master=back, width='1280', height='720', bg='black', highlightthickness=0)
+# canvas.pack()
+
+w = tk.Label(master=back, text=current_file(), bg='black', fg='white')
+w.pack()
+
+v = tk.StringVar()
+v.trace("w", lambda name, index, mode, sv=v: callback(sv))
+e = tk.Entry(master=back, textvariable=v, width='500', bg='black', fg='white', highlightbackground='grey')
+e.pack()
+e.bind('<Return>', comp_s)
+e.bind("<Control-p>", playTrack)
+e.bind("<Control-t>", transcribeTrack)
+e.bind("<Control-s>", saveTrack)
+e.bind("<Control-d>", discardTrack)
+
+o1 = tk.Label(master=back, text="text here", bg='black', fg='white')
+o1.pack()
+o2 = tk.Label(master=back, text="arpabet here", bg='black', fg='white')
+o2.pack()
+
+mw.mainloop()