|
|
- import tkinter as tk
- import vlc
- from os import listdir, remove
- from os.path import isfile, join
- import json
- import shutil
-
- # Set up IBM speech to text
- from ibm_watson import SpeechToTextV1
- from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
-
# IBM Watson Speech-to-Text client used by transcribeTrack().
# SECURITY NOTE(review): the API key and instance URL are hard-coded and
# committed with the source — rotate this credential and load it from an
# environment variable or config file instead.
authenticator = IAMAuthenticator('b373X-km7u5pAaz2JoizXigcVFZFEB8CIntgYgWzbCQ4')
speech_to_text = SpeechToTextV1(
    authenticator=authenticator
)

speech_to_text.set_service_url('https://api.us-south.speech-to-text.watson.cloud.ibm.com/instances/0e7a3edf-309c-4e64-b345-8251781245e4')
# NOTE(review): disabling SSL verification exposes the API key to
# man-in-the-middle interception — confirm this is actually required.
speech_to_text.set_disable_ssl_verification(True)
-
# Convert plain text to ARPAbet notation using the pronunciation
# dictionary loaded below (dictionary words are wrapped as {PHONEMES}).
def ARPA(text, pron_dict=None):
    """Convert *text* to ARPAbet notation.

    Each space-separated word is looked up (uppercased) in the pronunciation
    dictionary; found words are replaced by "{PHONEMES}", unknown words pass
    through unchanged.  Trailing punctuation (!?,.;) is preserved after the
    converted word.

    Parameters:
        text: the sentence to convert.
        pron_dict: optional word -> phoneme mapping; defaults to the
            module-level `thisdict` loaded from merged.dict_1.1.txt.

    Fixes vs. the original: punctuation is now stripped regardless of order
    (the old loop only handled sequences in the fixed order !?,.; — "hi!."
    was left half-stripped); all-whitespace input no longer raises
    IndexError; the bare `except:` and the dead `out + ""` no-op are gone.
    """
    if not text:
        return ""
    lookup = thisdict if pron_dict is None else pron_dict
    converted = []
    for raw_word in text.split(" "):
        word = raw_word
        trailing = ""
        # Peel trailing punctuation one character at a time, keeping at
        # least one character of the word itself.
        while len(word) > 1 and word[-1] in "!?,.;":
            trailing = word[-1] + trailing
            word = word[:-1]
        phonemes = lookup.get(word.upper(), "")
        if phonemes:
            word = "{" + str(phonemes) + "}"
        # Skip empty tokens (runs of spaces), matching the original's
        # per-step strip() which collapsed them away.
        if word:
            converted.append(word + trailing)
    return " ".join(converted)
-
# Load the ARPAbet pronunciation dictionary ("WORD PHONEMES..." per line).
# Iterating the file in reverse makes the FIRST occurrence of a duplicated
# word win (later assignments overwrite earlier ones) — same as the original.
thisdict = {}
with open('merged.dict_1.1.txt', 'r') as dict_file:  # fix: file was never closed
    for dict_line in reversed(dict_file.read().splitlines()):
        # partition() (vs split(" ", 1)[1]) tolerates a line with no space,
        # e.g. a trailing blank line, instead of raising IndexError.
        word, _, phonemes = dict_line.partition(" ")
        thisdict[word] = phonemes.strip()
-
# Dataset layout and navigation state.
IDX = 0  # index into `onlyfiles` of the clip currently shown in the UI
MYPATH = "./dataset/in"  # folder of not-yet-labeled input audio clips
OUTPATH = './dataset/'  # root for the filelists plus the out/ and discard/ folders
-
# One-time reset helpers: uncomment to truncate both output filelists
# before starting a fresh labeling session.
# f = open("{}filelist.txt".format(OUTPATH), "w")
# f.write("")
# f.close()

# f = open("{}filelist_arpa.txt".format(OUTPATH), "w")
# f.write("")
# f.close()
-
# Worklist: every regular file in the input folder.  Mutated in place
# (popped) as clips are saved or discarded.  The original also assigned an
# unused `img = onlyfiles[0]` here; removed as dead code.
onlyfiles = [f for f in listdir(MYPATH) if isfile(join(MYPATH, f))]
-
def keyEvent(event):
    """Arrow-key navigation: Left/Right steps through the clip list with
    wraparound, refreshes the filename label, and clears the entry box.

    Any other key is ignored.  (The UI-refresh code was previously
    duplicated in both branches; it is consolidated here.)
    """
    global IDX
    if event.keysym == 'Left':
        IDX -= 1
        if IDX < 0:
            IDX = len(onlyfiles) - 1
    elif event.keysym == 'Right':
        IDX += 1
        if IDX >= len(onlyfiles):
            IDX = 0
    else:
        return  # not a navigation key
    w.config(text=onlyfiles[IDX])
    v.set("")
-
def comp_s(event):
    """<Return> handler: echo the current entry contents to stdout, then
    reset the entry box.  (Reading a StringVar needs no `global`.)
    """
    typed = v.get()
    print(typed)
    v.set("")
-
# Keep a reference to the active player: in the original the MediaPlayer was
# a function local, so it could be garbage-collected while still playing,
# cutting the audio off.
_active_player = None

def playTrack(event):
    """Ctrl-P handler: play the currently selected clip with VLC."""
    global _active_player
    _active_player = vlc.MediaPlayer("{}/{}".format(MYPATH, onlyfiles[IDX]))
    _active_player.play()
-
def transcribeTrack(event):
    """Ctrl-T handler: send the current clip to IBM Watson Speech-to-Text
    and put the first transcript into the entry box.

    Fix: the original indexed ["results"][0] unconditionally and raised
    IndexError when the service recognized nothing; now the entry is simply
    cleared in that case.
    """
    with open("{}/{}".format(MYPATH, onlyfiles[IDX]),
              'rb') as audio_file:
        speech_recognition_results = speech_to_text.recognize(
            audio=audio_file,
            content_type='audio/wav',
            word_alternatives_threshold=0.9,
        ).get_result()
    results = speech_recognition_results.get("results")
    if results:
        v.set(results[0]["alternatives"][0]["transcript"])
    else:
        v.set("")  # nothing recognized — clear instead of crashing
-
def saveTrack(event):
    """Ctrl-S handler: append "wavs/out/<file>|<text>" lines for the current
    clip to both filelists (plain text from o1, ARPAbet from o2), then move
    the clip into the out/ folder.

    Uses `with` context managers instead of the original manual
    open()/close() pairs, so the files are closed even if a write raises.
    """
    line = "wavs/out/{}|{}\n"
    with open("{}filelist.txt".format(OUTPATH), "a") as f:
        f.write(line.format(onlyfiles[IDX], o1.cget('text')))
    with open("{}filelist_arpa.txt".format(OUTPATH), "a") as f:
        f.write(line.format(onlyfiles[IDX], o2.cget('text')))
    move_audio_file(IDX, 'out')
-
def discardTrack(event):
    """Ctrl-D handler: send the current clip to the discard folder."""
    move_audio_file(IDX, 'discard')
-
def callback(sv):
    """Entry-change hook: mirror the stripped entry text into the plain-text
    preview label (o1) and its ARPAbet conversion into the second label (o2).

    Reads the StringVar once (the original called sv.get() twice) and drops
    the redundant "{}".format(...) wrapper.
    """
    text = sv.get().strip()
    o1.config(text=text, wraplength=500)
    o2.config(text=ARPA(text), wraplength=500)
-
def move_audio_file(idx, destination):
    """Archive clip *idx*: copy it into OUTPATH/<destination>, delete the
    original, drop it from the worklist, and show the previous clip.

    Fixes two bugs: the module-level IDX was never updated after the pop,
    leaving the label and the current index pointing at different files; and
    an empty worklist raised IndexError on onlyfiles[-1].
    """
    global IDX
    file_name = onlyfiles.pop(idx)
    shutil.copyfile(MYPATH + '/' + file_name, OUTPATH + destination + '/' + file_name)
    remove(MYPATH + '/' + file_name)
    if onlyfiles:
        # Step back to the previous clip, wrapping like the original did.
        IDX = idx - 1
        if IDX < 0:
            IDX = len(onlyfiles) - 1
        w.config(text=onlyfiles[IDX])
    else:
        IDX = 0
        w.config(text='All clips processed')
    v.set("")
-
-
# ---- Main window -----------------------------------------------------------
mw = tk.Tk()

mw.title('Training Data - {} remaining'.format(len(onlyfiles)))
mw.geometry('500x200')
mw.configure(bg='black')


# Toolbar buttons mirror the keyboard shortcuts.  Fix: they previously had
# command=None and did nothing when clicked; every handler ignores its event
# argument, so passing None is safe.
toolbar = tk.Frame(master=mw, width='500', height='24', borderwidth=2, bg='slategray4', relief='raised')
toolbar.pack()
play_btn = tk.Button(toolbar, text="(C-p) Play track", command=lambda: playTrack(None))
play_btn.pack(side='left')
save_btn = tk.Button(toolbar, text="(C-s) Save track", command=lambda: saveTrack(None))
save_btn.pack(side='left')
transcribe_btn = tk.Button(toolbar, text="(C-t) Transcribe track", command=lambda: transcribeTrack(None))
transcribe_btn.pack(side='left')
discard_btn = tk.Button(toolbar, text="(C-d) Discard track", command=lambda: discardTrack(None))
discard_btn.pack(side='left')

# Background frame holds the global key bindings and must keep focus for
# the arrow-key navigation to work.
back = tk.Frame(master=mw, bg='black')
back.bind('<Key>', keyEvent)
back.bind("<Control-p>", playTrack)
back.bind("<Control-t>", transcribeTrack)
back.bind("<Control-s>", saveTrack)
back.bind("<Control-d>", discardTrack)
back.pack_propagate(0)
back.pack(fill=tk.BOTH, expand=1)
back.focus_set()

# Label showing the filename of the clip currently being labeled.
w = tk.Label(master=back, text=onlyfiles[IDX], bg='black', fg='white')
w.pack()

# Transcript entry; every keystroke re-renders the preview labels below via
# callback().  NOTE(review): StringVar.trace() is deprecated — trace_add()
# is the modern equivalent, kept as-is for older-Tk compatibility.
v = tk.StringVar()
v.trace("w", lambda name, index, mode, sv=v: callback(v))
e = tk.Entry(master=back, textvariable=v, width='500', bg='black', fg='white', highlightbackground='grey')
e.pack()
e.bind('<Return>', comp_s)
e.bind("<Control-p>", playTrack)
e.bind("<Control-t>", transcribeTrack)
e.bind("<Control-s>", saveTrack)
e.bind("<Control-d>", discardTrack)

# Preview labels: plain text (o1) and its ARPAbet conversion (o2).
o1 = tk.Label(master=back, text="text here", bg='black', fg='white')
o1.pack()
o2 = tk.Label(master=back, text="arpabet here", bg='black', fg='white')
o2.pack()

mw.mainloop()
|