|
|
- import tkinter as tk
- import vlc
- from os import listdir, remove
- from os.path import isfile, join
- import json
- import shutil
-
- # Set up IBM speech to text
- from ibm_watson import SpeechToTextV1
- from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
-
# IBM Watson Speech-to-Text client used by transcribeTrack().
# SECURITY NOTE(review): the API key and instance URL are hard-coded and
# committed with the source — rotate this credential and load it from an
# environment variable or config file instead.
authenticator = IAMAuthenticator('b373X-km7u5pAaz2JoizXigcVFZFEB8CIntgYgWzbCQ4')
speech_to_text = SpeechToTextV1(
    authenticator=authenticator
)

speech_to_text.set_service_url('https://api.us-south.speech-to-text.watson.cloud.ibm.com/instances/0e7a3edf-309c-4e64-b345-8251781245e4')
# NOTE(review): disabling SSL verification exposes the API key to
# man-in-the-middle interception — confirm this is actually required.
speech_to_text.set_disable_ssl_verification(True)
-
# Convert plain text to ARPAbet notation using the pronunciation
# dictionary loaded below (dictionary words are wrapped as {PHONEMES}).
def ARPA(text, pron_dict=None):
    """Convert *text* to ARPAbet notation.

    Each space-separated word is looked up (uppercased) in the pronunciation
    dictionary; found words are replaced by "{PHONEMES}", unknown words pass
    through unchanged.  Trailing punctuation (!?,.;) is preserved after the
    converted word.

    Parameters:
        text: the sentence to convert.
        pron_dict: optional word -> phoneme mapping; defaults to the
            module-level `thisdict` loaded from merged.dict_1.1.txt.

    Fixes vs. the original: punctuation is now stripped regardless of order
    (the old loop only handled sequences in the fixed order !?,.; — "hi!."
    was left half-stripped); all-whitespace input no longer raises
    IndexError; the bare `except:` and the dead `out + ""` no-op are gone.
    """
    if not text:
        return ""
    lookup = thisdict if pron_dict is None else pron_dict
    converted = []
    for raw_word in text.split(" "):
        word = raw_word
        trailing = ""
        # Peel trailing punctuation one character at a time, keeping at
        # least one character of the word itself.
        while len(word) > 1 and word[-1] in "!?,.;":
            trailing = word[-1] + trailing
            word = word[:-1]
        phonemes = lookup.get(word.upper(), "")
        if phonemes:
            word = "{" + str(phonemes) + "}"
        # Skip empty tokens (runs of spaces), matching the original's
        # per-step strip() which collapsed them away.
        if word:
            converted.append(word + trailing)
    return " ".join(converted)
-
# Load the ARPAbet pronunciation dictionary ("WORD PHONEMES..." per line).
# Iterating the file in reverse makes the FIRST occurrence of a duplicated
# word win (later assignments overwrite earlier ones) — same as the original.
thisdict = {}
with open('merged.dict_1.1.txt', 'r') as dict_file:  # fix: file was never closed
    for dict_line in reversed(dict_file.read().splitlines()):
        # partition() (vs split(" ", 1)[1]) tolerates a line with no space,
        # e.g. a trailing blank line, instead of raising IndexError.
        word, _, phonemes = dict_line.partition(" ")
        thisdict[word] = phonemes.strip()
-
# Dataset layout and navigation state.
IDX = 0  # index into `onlyfiles` of the clip currently shown in the UI
MYPATH = "./dataset/in"  # folder of not-yet-labeled input audio clips
OUTPATH = './dataset/'  # root for the filelists plus the out/ and discard/ folders
-
# One-time reset helpers: uncomment to truncate both output filelists
# before starting a fresh labeling session.
# f = open("{}filelist.txt".format(OUTPATH), "w")
# f.write("")
# f.close()

# f = open("{}filelist_arpa.txt".format(OUTPATH), "w")
# f.write("")
# f.close()
-
# Worklist: every regular file in the input folder.  Mutated in place
# (popped) as clips are saved or discarded.  The original also assigned an
# unused `img = onlyfiles[0]` here; removed as dead code.
onlyfiles = [f for f in listdir(MYPATH) if isfile(join(MYPATH, f))]
-
def keyEvent(event):
    """Arrow-key navigation: Left/Right steps through the clip list with
    wraparound, refreshes the filename label, and clears the entry box.

    Any other key is ignored.  (The UI-refresh code was previously
    duplicated in both branches; it is consolidated here.)
    """
    global IDX
    if event.keysym == 'Left':
        IDX -= 1
        if IDX < 0:
            IDX = len(onlyfiles) - 1
    elif event.keysym == 'Right':
        IDX += 1
        if IDX >= len(onlyfiles):
            IDX = 0
    else:
        return  # not a navigation key
    w.config(text=onlyfiles[IDX])
    v.set("")
-
def comp_s(event):
    """<Return> handler: echo the current entry contents to stdout, then
    reset the entry box.  (Reading a StringVar needs no `global`.)
    """
    typed = v.get()
    print(typed)
    v.set("")
-
# Keep a reference to the active player: in the original the MediaPlayer was
# a function local, so it could be garbage-collected while still playing,
# cutting the audio off.
_active_player = None

def playTrack(event):
    """Ctrl-P handler: play the currently selected clip with VLC."""
    global _active_player
    _active_player = vlc.MediaPlayer("{}/{}".format(MYPATH, onlyfiles[IDX]))
    _active_player.play()
-
def transcribeTrack(event):
    """Ctrl-T handler: send the current clip to IBM Watson Speech-to-Text
    and put the first transcript into the entry box.

    Fix: the original indexed ["results"][0] unconditionally and raised
    IndexError when the service recognized nothing; now the entry is simply
    cleared in that case.
    """
    with open("{}/{}".format(MYPATH, onlyfiles[IDX]),
              'rb') as audio_file:
        speech_recognition_results = speech_to_text.recognize(
            audio=audio_file,
            content_type='audio/wav',
            word_alternatives_threshold=0.9,
        ).get_result()
    results = speech_recognition_results.get("results")
    if results:
        v.set(results[0]["alternatives"][0]["transcript"])
    else:
        v.set("")  # nothing recognized — clear instead of crashing
-
def saveTrack(event):
    """Ctrl-S handler: append "wavs/out/<file>|<text>" lines for the current
    clip to both filelists (plain text from o1, ARPAbet from o2), then move
    the clip into the out/ folder.

    Uses `with` context managers instead of the original manual
    open()/close() pairs, so the files are closed even if a write raises.
    """
    line = "wavs/out/{}|{}\n"
    with open("{}filelist.txt".format(OUTPATH), "a") as f:
        f.write(line.format(onlyfiles[IDX], o1.cget('text')))
    with open("{}filelist_arpa.txt".format(OUTPATH), "a") as f:
        f.write(line.format(onlyfiles[IDX], o2.cget('text')))
    move_audio_file(IDX, 'out')
-
def discardTrack(event):
    """Ctrl-D handler: send the current clip to the discard folder."""
    move_audio_file(IDX, 'discard')
-
def callback(sv):
    """Entry-change hook: mirror the stripped entry text into the plain-text
    preview label (o1) and its ARPAbet conversion into the second label (o2).

    Reads the StringVar once (the original called sv.get() twice) and drops
    the redundant "{}".format(...) wrapper.
    """
    text = sv.get().strip()
    o1.config(text=text, wraplength=500)
    o2.config(text=ARPA(text), wraplength=500)
-
def move_audio_file(idx, destination):
    """Archive clip *idx*: copy it into OUTPATH/<destination>, delete the
    original, drop it from the worklist, and show the previous clip.

    Fixes two bugs: the module-level IDX was never updated after the pop,
    leaving the label and the current index pointing at different files; and
    an empty worklist raised IndexError on onlyfiles[-1].
    """
    global IDX
    file_name = onlyfiles.pop(idx)
    shutil.copyfile(MYPATH + '/' + file_name, OUTPATH + destination + '/' + file_name)
    remove(MYPATH + '/' + file_name)
    if onlyfiles:
        # Step back to the previous clip, wrapping like the original did.
        IDX = idx - 1
        if IDX < 0:
            IDX = len(onlyfiles) - 1
        w.config(text=onlyfiles[IDX])
    else:
        IDX = 0
        w.config(text='All clips processed')
    v.set("")
-
-
# ---- Main window -----------------------------------------------------------
mw = tk.Tk()

mw.title('Training Data - {} remaining'.format(len(onlyfiles)))
mw.geometry('500x200')
mw.configure(bg='black')


# Toolbar buttons mirror the keyboard shortcuts.  Fix: they previously had
# command=None and did nothing when clicked; every handler ignores its event
# argument, so passing None is safe.
toolbar = tk.Frame(master=mw, width='500', height='24', borderwidth=2, bg='slategray4', relief='raised')
toolbar.pack()
play_btn = tk.Button(toolbar, text="(C-p) Play track", command=lambda: playTrack(None))
play_btn.pack(side='left')
save_btn = tk.Button(toolbar, text="(C-s) Save track", command=lambda: saveTrack(None))
save_btn.pack(side='left')
transcribe_btn = tk.Button(toolbar, text="(C-t) Transcribe track", command=lambda: transcribeTrack(None))
transcribe_btn.pack(side='left')
discard_btn = tk.Button(toolbar, text="(C-d) Discard track", command=lambda: discardTrack(None))
discard_btn.pack(side='left')

# Background frame holds the global key bindings and must keep focus for
# the arrow-key navigation to work.
back = tk.Frame(master=mw, bg='black')
back.bind('<Key>', keyEvent)
back.bind("<Control-p>", playTrack)
back.bind("<Control-t>", transcribeTrack)
back.bind("<Control-s>", saveTrack)
back.bind("<Control-d>", discardTrack)
back.pack_propagate(0)
back.pack(fill=tk.BOTH, expand=1)
back.focus_set()

# Label showing the filename of the clip currently being labeled.
w = tk.Label(master=back, text=onlyfiles[IDX], bg='black', fg='white')
w.pack()

# Transcript entry; every keystroke re-renders the preview labels below via
# callback().  NOTE(review): StringVar.trace() is deprecated — trace_add()
# is the modern equivalent, kept as-is for older-Tk compatibility.
v = tk.StringVar()
v.trace("w", lambda name, index, mode, sv=v: callback(v))
e = tk.Entry(master=back, textvariable=v, width='500', bg='black', fg='white', highlightbackground='grey')
e.pack()
e.bind('<Return>', comp_s)
e.bind("<Control-p>", playTrack)
e.bind("<Control-t>", transcribeTrack)
e.bind("<Control-s>", saveTrack)
e.bind("<Control-d>", discardTrack)

# Preview labels: plain text (o1) and its ARPAbet conversion (o2).
o1 = tk.Label(master=back, text="text here", bg='black', fg='white')
o1.pack()
o2 = tk.Label(master=back, text="arpabet here", bg='black', fg='white')
o2.pack()

mw.mainloop()
|