Initial commit

4 years ago · 4332977208
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,5 @@
 dataset/
 .DS_Store
 env/
 *.tar
 merged.dict*
--- a/download_dict.py
+++ b/download_dict.py
@ -0,0 +1,5 @@
 from mega import Mega

 # Public MEGA link of the dictionary file fetched by this script.
 DICT_URL = 'https://mega.nz/#!yAMyFYCI!o_UmixbiIzosyYk-6O5xRZZDGpFRik_eMrZum-iQuhQ'

 # No account credentials are passed; presumably this opens an anonymous
 # session (see the python3-mega client) -- confirm against its docs.
 m = Mega.from_ephemeral()
 print("Downloading Dictionary...")
 m.download_from_url(DICT_URL)
--- a/download_dict.sh
+++ b/download_dict.sh
@ -0,0 +1,9 @@
 #!/bin/sh
 # Install the python3-mega client from source, then run download_dict.py
 # to fetch the dictionary.

 # Abort on the first failing command: without this, a failed clone or cd
 # would let the install steps run in the wrong directory.
 set -e

 # Re-use an existing checkout so the script can be run more than once.
 if [ ! -d python3-mega ]; then
     git clone https://github.com/jeroenmeulenaar/python3-mega
 fi

 # Install inside a subshell so the caller's working directory is untouched.
 (
     cd python3-mega
     pip install -r requirements.txt
     python setup.py install
 )

 python download_dict.py
--- a/get_data.sh
+++ b/get_data.sh
@ -0,0 +1,10 @@
 #!/bin/sh
 # Download the dataset archive, unpack it and lay out the dataset/
 # working directories (in, out, discard).

 # Abort on the first failing command so a failed download or extraction
 # never reaches the mv / rm -rf steps below.
 set -e

 wget -O dataset.tar --content-disposition --user-agent "macintosh" https://cloud.technicalincompetence.club/index.php/s/W8rLGrCKgrFXw8z/download
 tar -xvf dataset.tar

 # The archive unpacks to wavs/out/; those files become the tool's input set.
 mkdir -p dataset/in
 mv wavs/out/* dataset/in/
 rm -rf wavs

 mkdir -p dataset/out
 mkdir -p dataset/discard
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,3 @@
 ibm-watson==4.5.0
 python-mega==0.1.0
 python-vlc==3.0.10114
--- a/window.py
+++ b/window.py
@ -0,0 +1,185 @@
 import tkinter as tk
 import vlc
 from os import listdir, remove
 from os.path import isfile, join
 import json
 import shutil

 # Set up IBM speech to text
 import os

 from ibm_watson import SpeechToTextV1
 from ibm_cloud_sdk_core.authenticators import IAMAuthenticator

 # The API key, service URL and SSL switch can be overridden through the
 # environment; the previous hard-coded values remain the defaults so existing
 # set-ups keep working.
 # NOTE(review): the default API key below is committed in plain text. It
 # should be rotated and supplied only through SPEECH_TO_TEXT_APIKEY.
 authenticator = IAMAuthenticator(
     os.environ.get(
         'SPEECH_TO_TEXT_APIKEY',
         'b373X-km7u5pAaz2JoizXigcVFZFEB8CIntgYgWzbCQ4',
     )
 )
 speech_to_text = SpeechToTextV1(
     authenticator=authenticator
 )

 speech_to_text.set_service_url(
     os.environ.get(
         'SPEECH_TO_TEXT_URL',
         'https://api.us-south.speech-to-text.watson.cloud.ibm.com/instances/0e7a3edf-309c-4e64-b345-8251781245e4',
     )
 )
 # NOTE(review): certificate verification is still disabled by default, exactly
 # as before. Set SPEECH_TO_TEXT_DISABLE_SSL=false to turn verification back on.
 speech_to_text.set_disable_ssl_verification(
     os.environ.get('SPEECH_TO_TEXT_DISABLE_SSL', 'true').strip().lower() != 'false'
 )

 # Set up Arpabet dict
 def ARPA(text):
     """Convert *text* to its ARPAbet rendering, terminated with "␤".

     Each space-separated word is upper-cased and looked up in the module-level
     ``thisdict``. A hit is replaced by ``{PRONUNCIATION}``; a miss is kept as
     typed. Trailing ``! ? , . ;`` characters are detached before the lookup
     and re-attached after the (possibly replaced) word.

     Args:
         text: The transcript to convert.

     Returns:
         The converted string ending in "␤". When there is nothing to convert
         (empty or all-space input) the result is just "␤".
     """
     if len(text) == 0:
         return "␤"
     punctuation = "!?,.;"
     out = ''
     for token in text.split(" "):
         word = token
         end_chars = ''
         # Peel trailing punctuation in whatever order it appears, always
         # leaving at least one character so a lone "?" is never emptied.
         while len(word) > 1 and word[-1] in punctuation:
             end_chars = word[-1] + end_chars
             word = word[:-1]
         word_arpa = thisdict.get(word.upper(), '')
         if word_arpa:
             word = "{" + str(word_arpa) + "}"
         # strip() keeps empty tokens (from repeated spaces) from stacking
         # extra separators into the output.
         out = (out + " " + word + end_chars).strip()
     if not out.endswith("␤"):
         out = out + "␤"
     return out

 thisdict = {}   # And load it
 # Each line is "<WORD> <pronunciation>". The file is read bottom-up so that,
 # when a word appears more than once, the entry nearest the top of the file
 # is the one left in the dict.
 with open('merged.dict_1.1.txt', "r") as dict_file:
     dict_lines = dict_file.read().splitlines()
 for line in reversed(dict_lines):
     word, separator, pronunciation = line.partition(" ")
     if not separator:
         # Blank line or a word with no pronunciation: nothing to store.
         continue
     thisdict[word] = pronunciation.strip()

 # Stuff for dataset path
 IDX = 0                    # index into onlyfiles of the track currently shown
 MYPATH = "./dataset/in"    # folder scanned for tracks still to be processed
 OUTPATH = './dataset/'     # parent folder of the out/ and discard/ folders and the filelists

 # Disabled: would truncate both filelist outputs on start-up.
 # f = open("{}filelist.txt".format(OUTPATH), "w")
 # f.write("")
 # f.close()

 # f = open("{}filelist_arpa.txt".format(OUTPATH), "w")
 # f.write("")
 # f.close()

 # Names of the regular files directly inside MYPATH, in listdir() order.
 onlyfiles = list(filter(lambda name: isfile(join(MYPATH, name)), listdir(MYPATH)))
 if onlyfiles:
     img = onlyfiles[0]

 def keyEvent(event):
     """Step to the previous/next track on the Left/Right arrow keys.

     The selection wraps around at both ends of ``onlyfiles``. On a step the
     filename label is updated and the entry field is cleared. Any other key,
     or an empty track list, leaves everything untouched.

     Args:
         event: Tk key event; only ``event.keysym`` is read.
     """
     global IDX
     if event.keysym == 'Left':
         step = -1
     elif event.keysym == 'Right':
         step = 1
     else:
         return

     if not onlyfiles:
         # Every track has been processed; there is nothing to select.
         return

     new_idx = IDX + step
     if new_idx < 0:
         new_idx = len(onlyfiles) - 1
     elif new_idx >= len(onlyfiles):
         new_idx = 0
     IDX = new_idx

     w.config(text=onlyfiles[IDX])
     v.set("")

 def comp_s(event):
     """Print the entry's current text to stdout, then clear the entry.

     Bound to <Return> on the entry widget; *event* is not read.
     """
     typed = v.get()
     print(typed)
     v.set("")

 # Most recently started VLC player, kept so a repeated "play" can stop it first.
 _current_player = None


 def playTrack(event):
     """Play the currently selected track through VLC.

     Any playback started by an earlier call is stopped first, so pressing
     play repeatedly does not layer the audio. Does nothing when there are no
     tracks left.

     Args:
         event: Tk event (or None when triggered from a button); not read.
     """
     global _current_player
     if not onlyfiles:
         return
     if _current_player is not None:
         _current_player.stop()
     _current_player = vlc.MediaPlayer("{}/{}".format(MYPATH, onlyfiles[IDX]))
     _current_player.play()

 def transcribeTrack(event):
     """Transcribe the current track with the speech-to-text service.

     The track is uploaded as ``audio/wav``. The transcript of the first
     alternative of the first result is written into the entry field. When the
     service returns no result, the entry is left as it is and a note is
     printed. Does nothing when there are no tracks left.

     Args:
         event: Tk event (or None when triggered from a button); not read.
     """
     if not onlyfiles:
         return
     track_path = "{}/{}".format(MYPATH, onlyfiles[IDX])
     with open(track_path, 'rb') as audio_file:
         speech_recognition_results = speech_to_text.recognize(
             audio=audio_file,
             content_type='audio/wav',
             word_alternatives_threshold=0.9,
         ).get_result()
     # print(json.dumps(speech_recognition_results, indent=2))
     results = speech_recognition_results.get("results") or []
     if not results or not results[0].get("alternatives"):
         # The response carries no transcript to show.
         print("No transcript returned for {}".format(onlyfiles[IDX]))
         return
     v.set(results[0]["alternatives"][0]["transcript"])

 def saveTrack(event):
     """Record the current track's transcripts and move the track to ``out``.

     Appends one ``wavs/out/<file>|<text>`` line to ``filelist.txt`` (text of
     the plain preview label) and one to ``filelist_arpa.txt`` (text of the
     ARPAbet preview label), then moves the audio file. Does nothing when
     there are no tracks left.

     Args:
         event: Tk event (or None when triggered from a button); not read.
     """
     if not onlyfiles:
         return
     track_name = onlyfiles[IDX]
     entries = (
         ("{}filelist.txt".format(OUTPATH), o1),
         ("{}filelist_arpa.txt".format(OUTPATH), o2),
     )
     for list_path, label in entries:
         # The label text ends in "␤", which is not encodable in every
         # platform default encoding, so UTF-8 is requested explicitly.
         with open(list_path, "a", encoding="utf-8") as f:
             f.write("wavs/out/{}|{}\n".format(track_name, label.cget('text')))

     move_audio_file(IDX, 'out')

 def discardTrack(event):
     """Move the current track to the ``discard`` folder; no filelist entry is written.

     *event* is not read.
     """
     move_audio_file(IDX, destination='discard')

 def callback(sv):
     """Refresh both preview labels from the entry variable *sv*.

     ``o1`` shows the stripped text followed by "␤"; ``o2`` shows the ARPA()
     rendering of the same stripped text.
     """
     plain_preview = "{}␤".format(sv.get().strip())
     o1.config(text=plain_preview, wraplength=500)
     arpa_preview = ARPA(sv.get().strip())
     o2.config(text=arpa_preview, wraplength=500)

 def move_audio_file(idx, destination):
     """Move track *idx* out of the input folder and select its predecessor.

     The file goes from ``MYPATH`` to ``OUTPATH + destination`` and its name
     is dropped from ``onlyfiles``. The global ``IDX`` is then set to the
     previous track (wrapping to the last one), so the filename label and the
     track that play/save/discard act on stay in step. The entry field is
     cleared and the window title's remaining count is refreshed.

     Args:
         idx: Index into ``onlyfiles`` of the track to move.
         destination: Sub-folder of ``OUTPATH`` to move into (e.g. 'out',
             'discard').
     """
     global IDX
     if not onlyfiles:
         return

     file_path = onlyfiles[idx]
     # Move on disk first: if this raises, the in-memory list still matches
     # the folder contents.
     shutil.move(MYPATH + '/' + file_path, OUTPATH + destination + '/' + file_path)
     onlyfiles.pop(idx)

     if onlyfiles:
         idx = idx - 1
         if idx < 0:
             idx = len(onlyfiles) - 1
         IDX = idx
         w.config(text=onlyfiles[IDX])
     else:
         # That was the last track; there is no filename left to show.
         IDX = 0
         w.config(text="")
     v.set("")
     mw.title('Training Data - {} remaining'.format(len(onlyfiles)))


 # ---- Main window ----
 mw = tk.Tk()

 mw.title('Training Data - {} remaining'.format(len(onlyfiles)))
 mw.geometry('500x200')
 mw.configure(bg='black')


 # ---- Toolbar ----
 # Each button triggers the same handler as its keyboard shortcut. The handlers
 # take a Tk event that they never read, so the buttons pass None.
 toolbar = tk.Frame(master=mw, width='500', height='24', borderwidth=2, bg='slategray4', relief='raised')
 toolbar.pack()
 play_btn = tk.Button(toolbar, text="(C-p) Play track", command=lambda: playTrack(None))
 play_btn.pack(side='left')
 save_btn = tk.Button(toolbar, text="(C-s) Save track", command=lambda: saveTrack(None))
 save_btn.pack(side='left')
 transcribe_btn = tk.Button(toolbar, text="(C-t) Transcribe track", command=lambda: transcribeTrack(None))
 transcribe_btn.pack(side='left')
 discard_btn = tk.Button(toolbar, text="(C-d) Discard track", command=lambda: discardTrack(None))
 discard_btn.pack(side='left')

 # ---- Content frame: arrow-key navigation plus the Ctrl shortcuts ----
 back = tk.Frame(master=mw, bg='black')
 back.bind('<Key>', keyEvent)
 back.bind("<Control-p>", playTrack)
 back.bind("<Control-t>", transcribeTrack)
 back.bind("<Control-s>", saveTrack)
 back.bind("<Control-d>", discardTrack)
 back.pack_propagate(0)
 back.pack(fill=tk.BOTH, expand=1)
 back.focus_set()

 # canvas = tk.Canvas(master=back, width='1280', height='720', bg='black', highlightthickness=0)  
 # canvas.pack()

 # Filename of the selected track; blank when the input folder is empty.
 w = tk.Label(master=back, text=onlyfiles[IDX] if onlyfiles else "", bg='black', fg='white')
 w.pack()

 # Entry for the transcript; every write to v re-renders the two previews.
 v = tk.StringVar()
 v.trace("w", lambda name, index, mode, sv=v: callback(sv))
 e = tk.Entry(master=back, textvariable=v, width='500', bg='black', fg='white', highlightbackground='grey')
 e.pack()
 e.bind('<Return>', comp_s)
 e.bind("<Control-p>", playTrack)
 e.bind("<Control-t>", transcribeTrack)
 e.bind("<Control-s>", saveTrack)
 e.bind("<Control-d>", discardTrack)

 # Previews: o1 is the plain text, o2 the ARPAbet rendering (see callback).
 o1 = tk.Label(master=back, text="text here", bg='black', fg='white')
 o1.pack()
 o2 = tk.Label(master=back, text="arpabet here", bg='black', fg='white')
 o2.pack()

 mw.mainloop()