# Dataset-labelling GUI: step through wav clips in ./dataset/in, play them,
# transcribe them with IBM Watson, correct the text by hand, then append the
# transcript (plain and ARPAbet) to the filelists and sort each clip into
# ./dataset/out or ./dataset/discard.
import tkinter as tk
import vlc
from os import listdir, remove
from os.path import isfile, join
import json  # only used by the commented-out debug dump below
import shutil

# Set up IBM Watson speech to text
from ibm_watson import SpeechToTextV1
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator

authenticator = IAMAuthenticator('b373X-km7u5pAaz2JoizXigcVFZFEB8CIntgYgWzbCQ4')
speech_to_text = SpeechToTextV1(authenticator=authenticator)

speech_to_text.set_service_url('https://api.us-south.speech-to-text.watson.cloud.ibm.com/instances/0e7a3edf-309c-4e64-b345-8251781245e4')
speech_to_text.set_disable_ssl_verification(True)
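# NOTE: recognize() below sends clips as 'audio/wav'; convert other formats first.
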
# Set up ARPAbet conversion: wrap each word's phonemes in {} where the
# dictionary knows the word, and carry trailing punctuation through.
def ARPA(text):
    if len(text) == 0:
        return ""
    out = ''
    for word in text.split(" "):
        # Peel trailing punctuation off so the dictionary lookup sees the
        # bare word; it is re-appended after conversion.
        end_chars = ''
        while len(word) > 1 and word[-1] in "!?,.;":
            end_chars = word[-1] + end_chars
            word = word[:-1]
        try:
            word_arpa = thisdict[word.upper()]
        except KeyError:
            word_arpa = ''
        if len(word_arpa) != 0:
            word = "{" + word_arpa + "}"
        out = (out + " " + word + end_chars).strip()
    return out

# Load the ARPAbet dictionary (WORD -> phoneme string). Reading the file in
# reverse means that on duplicate words, the entry nearest the top wins.
thisdict = {}
for line in reversed(open('merged.dict_1.1.txt', "r").read().splitlines()):
    word, pron = line.split(" ", 1)
    thisdict[word] = pron.strip()
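
# Illustrative (actual phonemes depend on merged.dict_1.1.txt):
#   ARPA("Hello world!")  ->  "{HH AH0 L OW1} {W ER1 L D}!"
# Words missing from the dictionary pass through unchanged.
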
# Dataset paths
IDX = 0
MYPATH = "./dataset/in"
OUTPATH = './dataset/'

# Uncomment to truncate the filelists before a fresh labelling session:
# f = open("{}filelist.txt".format(OUTPATH), "w")
# f.write("")
# f.close()

# f = open("{}filelist_arpa.txt".format(OUTPATH), "w")
# f.write("")
# f.close()
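
# NOTE: ./dataset/out and ./dataset/discard must already exist;
# shutil.copyfile will not create missing directories.
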
onlyfiles = [f for f in listdir(MYPATH) if isfile(join(MYPATH, f))]
if len(onlyfiles) == 0:
    raise SystemExit("No audio files found in {}".format(MYPATH))

def keyEvent(event):
    # Left/Right arrow keys step through the clip list, wrapping at the ends.
    global IDX
    if event.keysym == 'Left':
        IDX = (IDX - 1) % len(onlyfiles)
        w.config(text=onlyfiles[IDX])
        v.set("")
    if event.keysym == 'Right':
        IDX = (IDX + 1) % len(onlyfiles)
        w.config(text=onlyfiles[IDX])
        v.set("")

def comp_s(event):
    # Return key: print the current transcript and clear the entry.
    print(v.get())
    v.set("")

def playTrack(event):
    # Keep the player in a global so it isn't garbage-collected mid-playback.
    global player
    player = vlc.MediaPlayer("{}/{}".format(MYPATH, onlyfiles[IDX]))
    player.play()

def transcribeTrack(event):
    # Send the current clip to Watson and put the transcript in the entry.
    with open("{}/{}".format(MYPATH, onlyfiles[IDX]), 'rb') as audio_file:
        speech_recognition_results = speech_to_text.recognize(
            audio=audio_file,
            content_type='audio/wav',
            word_alternatives_threshold=0.9,
        ).get_result()
    # print(json.dumps(speech_recognition_results, indent=2))
    results = speech_recognition_results["results"]
    if results:
        v.set(results[0]["alternatives"][0]["transcript"])
    else:
        v.set("")  # no hypotheses returned (e.g. a silent clip)
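
# Only the first result's top alternative is used; the full response (see the
# commented-out json.dumps above) also carries word-alternative data.
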
def saveTrack(event):
    # Append "wav path|transcript" to both filelists, then move the clip
    # to the output folder.
    with open("{}filelist.txt".format(OUTPATH), "a") as f:
        f.write("wavs/out/{}|{}\n".format(onlyfiles[IDX], o1.cget('text')))
    with open("{}filelist_arpa.txt".format(OUTPATH), "a") as f:
        f.write("wavs/out/{}|{}\n".format(onlyfiles[IDX], o2.cget('text')))
    move_audio_file(IDX, 'out')

def discardTrack(event):
    move_audio_file(IDX, 'discard')

def callback(sv):
    # Fires on every keystroke via the StringVar trace: mirror the entry text
    # into the plain label and its ARPAbet conversion into the other.
    o1.config(text=sv.get().strip(), wraplength=500)
    o2.config(text=ARPA(sv.get().strip()), wraplength=500)

def move_audio_file(idx, destination):
    # Copy the clip from the inbox to OUTPATH/<destination>, delete the
    # original, and advance the UI to the next remaining clip.
    global IDX
    file_path = onlyfiles.pop(idx)
    shutil.copyfile(MYPATH + '/' + file_path, OUTPATH + destination + '/' + file_path)
    remove(MYPATH + '/' + file_path)

    if len(onlyfiles) == 0:
        mw.destroy()  # nothing left to label
        return
    IDX = idx % len(onlyfiles)
    mw.title('Training Data - {} remaining'.format(len(onlyfiles)))
    w.config(text=onlyfiles[IDX])
    v.set("")

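
# Build the GUI: toolbar, filename label, transcript entry, preview labels.
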
mw = tk.Tk()

mw.title('Training Data - {} remaining'.format(len(onlyfiles)))
mw.geometry('500x200')
mw.configure(bg='black')

toolbar = tk.Frame(master=mw, width='500', height='24', borderwidth=2, bg='slategray4', relief='raised')
toolbar.pack()
# Buttons mirror the Control-key bindings below.
play_btn = tk.Button(toolbar, text="(C-p) Play track", command=lambda: playTrack(None))
play_btn.pack(side='left')
save_btn = tk.Button(toolbar, text="(C-s) Save track", command=lambda: saveTrack(None))
save_btn.pack(side='left')
transcribe_btn = tk.Button(toolbar, text="(C-t) Transcribe track", command=lambda: transcribeTrack(None))
transcribe_btn.pack(side='left')
discard_btn = tk.Button(toolbar, text="(C-d) Discard track", command=lambda: discardTrack(None))
discard_btn.pack(side='left')

back = tk.Frame(master=mw, bg='black')
back.bind('<Key>', keyEvent)
back.bind("<Control-p>", playTrack)
back.bind("<Control-t>", transcribeTrack)
back.bind("<Control-s>", saveTrack)
back.bind("<Control-d>", discardTrack)
back.pack_propagate(0)
back.pack(fill=tk.BOTH, expand=1)
back.focus_set()
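
# The arrow-key and Control bindings only fire while `back` (or the entry,
# which repeats the Control bindings) holds keyboard focus.
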
# canvas = tk.Canvas(master=back, width='1280', height='720', bg='black', highlightthickness=0)
# canvas.pack()

# Filename of the current clip.
w = tk.Label(master=back, text=onlyfiles[IDX], bg='black', fg='white')
w.pack()

# Editable transcript; the trace keeps the preview labels below in sync.
v = tk.StringVar()
v.trace("w", lambda name, index, mode, sv=v: callback(sv))
e = tk.Entry(master=back, textvariable=v, width='500', bg='black', fg='white', highlightbackground='grey')
e.pack()
e.bind('<Return>', comp_s)
e.bind("<Control-p>", playTrack)
e.bind("<Control-t>", transcribeTrack)
e.bind("<Control-s>", saveTrack)
e.bind("<Control-d>", discardTrack)

# Preview labels: o1 holds the plain transcript, o2 its ARPAbet form;
# saveTrack() reads both when writing the filelists.
o1 = tk.Label(master=back, text="text here", bg='black', fg='white')
o1.pack()
o2 = tk.Label(master=back, text="arpabet here", bg='black', fg='white')
o2.pack()

mw.mainloop()