@@ -0,0 +1,185 @@
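# Small Tkinter tool for labelling TTS training audio: it walks the WAV files in
# ./dataset/in, plays them through VLC, optionally transcribes them with IBM Watson,
# and appends "wavs/out/<file>|<transcript>" lines to filelist.txt and filelist_arpa.txt.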
import tkinter as tk
import vlc
from os import listdir, remove
from os.path import isfile, join
import json
import shutil

# Set up IBM speech to text
from ibm_watson import SpeechToTextV1
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator

authenticator = IAMAuthenticator('b373X-km7u5pAaz2JoizXigcVFZFEB8CIntgYgWzbCQ4')
speech_to_text = SpeechToTextV1(
    authenticator=authenticator
)

speech_to_text.set_service_url('https://api.us-south.speech-to-text.watson.cloud.ibm.com/instances/0e7a3edf-309c-4e64-b345-8251781245e4')
speech_to_text.set_disable_ssl_verification(True)

# Set up Arpabet dict
def ARPA(text):
    if len(text) == 0:
        return ""
    out = ''
    for word_ in text.split(" "):
        word = word_; end_chars = ''
        # Peel trailing punctuation off the word and keep it to re-append afterwards
        while any(elem in word for elem in r"!?,.;") and len(word) > 1:
            if word[-1] == '!': end_chars = '!' + end_chars; word = word[:-1]
            if word[-1] == '?': end_chars = '?' + end_chars; word = word[:-1]
            if word[-1] == ',': end_chars = ',' + end_chars; word = word[:-1]
            if word[-1] == '.': end_chars = '.' + end_chars; word = word[:-1]
            if word[-1] == ';': end_chars = ';' + end_chars; word = word[:-1]
            else: break
        # Replace the word with its {ARPAbet} form if it is in the dictionary
        try: word_arpa = thisdict[word.upper()]
        except KeyError: word_arpa = ''
        if len(word_arpa) != 0: word = "{" + str(word_arpa) + "}"
        out = (out + " " + word + end_chars).strip()
    if out[-1] != ";": out = out + ";"  # make sure the line ends with ";"
    return out

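# merged.dict_1.1.txt maps "<WORD> <ARPAbet phonemes>" per line; reading it in reverse
# means the first entry for a duplicated word is the one that ends up in the dict.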
thisdict = {}  # And load it
for line in reversed((open('merged.dict_1.1.txt', "r").read()).splitlines()):
    thisdict[(line.split(" ", 1))[0]] = (line.split(" ", 1))[1].strip()

# Stuff for dataset path
IDX = 0
MYPATH = "./dataset/in"
OUTPATH = './dataset/'

# Uncomment to wipe the filelists before starting a new session:
# f = open("{}filelist.txt".format(OUTPATH), "w")
# f.write("")
# f.close()

# f = open("{}filelist_arpa.txt".format(OUTPATH), "w")
# f.write("")
# f.close()

onlyfiles = [f for f in listdir(MYPATH) if isfile(join(MYPATH, f))]
if len(onlyfiles) > 0:
    img = onlyfiles[0]

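# Left/Right arrow keys step backwards/forwards through the file list, wrapping at either end.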
def keyEvent(event):
    global IDX
    global v
    if (event.keysym == 'Left'):
        IDX = IDX - 1

        if IDX < 0:
            IDX = len(onlyfiles) - 1

        w.config(text=onlyfiles[IDX])
        v.set("")

    if (event.keysym == 'Right'):
        IDX = IDX + 1

        if IDX >= len(onlyfiles):
            IDX = 0

        w.config(text=onlyfiles[IDX])
        v.set("")

def comp_s(event):
    global v
    print(v.get())
    v.set("")

def playTrack(event):
    # Play the currently selected file with VLC
    player = vlc.MediaPlayer("{}/{}".format(MYPATH, onlyfiles[IDX]))
    player.play()

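# Ctrl-t: send the current WAV to Watson speech-to-text and put the top transcript
# into the entry box (which in turn updates the plain and ARPAbet preview labels).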
def transcribeTrack(event):
    global v
    with open("{}/{}".format(MYPATH, onlyfiles[IDX]), 'rb') as audio_file:
        speech_recognition_results = speech_to_text.recognize(
            audio=audio_file,
            content_type='audio/wav',
            word_alternatives_threshold=0.9,
        ).get_result()
    # print(json.dumps(speech_recognition_results, indent=2))
    v.set(speech_recognition_results["results"][0]["alternatives"][0]["transcript"])

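# Ctrl-s: append "wavs/out/<file>|<text>" to filelist.txt and the ARPAbet version to
# filelist_arpa.txt, then move the file to dataset/out.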
def saveTrack(event):
    f = open("{}filelist.txt".format(OUTPATH), "a")
    f.write("wavs/out/{}|{}\n".format(onlyfiles[IDX], o1.cget('text')))
    f.close()

    f = open("{}filelist_arpa.txt".format(OUTPATH), "a")
    f.write("wavs/out/{}|{}\n".format(onlyfiles[IDX], o2.cget('text')))
    f.close()

    move_audio_file(IDX, 'out')

def discardTrack(event):
    move_audio_file(IDX, 'discard')

def callback(sv):
    # Mirror the entry text into the plain and ARPAbet preview labels
    o1.config(text="{}".format(sv.get().strip()), wraplength=500)
    o2.config(text=ARPA(sv.get().strip()), wraplength=500)

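# Copy the finished file into ./dataset/<destination>, delete it from the input folder,
# and show the next remaining filename.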
def move_audio_file(idx, destination):
    file_path = onlyfiles[idx]
    onlyfiles.pop(idx)
    idx = idx - 1
    if idx < 0:
        idx = len(onlyfiles) - 1

    shutil.copyfile(MYPATH + '/' + file_path, OUTPATH + destination + '/' + file_path)
    remove(MYPATH + '/' + file_path)
    global v
    # Guard against an IndexError once the input folder is empty
    w.config(text=onlyfiles[idx] if onlyfiles else "No files left")
    v.set("")

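# Build the Tk UI: a toolbar of shortcut buttons, the current filename, the transcript
# entry box, and preview labels for the plain and ARPAbet versions of the text.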
mw = tk.Tk()

mw.title('Training Data - {} remaining'.format(len(onlyfiles)))
mw.geometry('500x200')
mw.configure(bg='black')

# The toolbar buttons only display the shortcuts (command=None); the actions themselves
# are bound to the Ctrl-key events below.
toolbar = tk.Frame(master=mw, width='500', height='24', borderwidth=2, bg='slategray4', relief='raised')
toolbar.pack()
play_btn = tk.Button(toolbar, text="(C-p) Play track", command=None)
play_btn.pack(side='left')
save_btn = tk.Button(toolbar, text="(C-s) Save track", command=None)
save_btn.pack(side='left')
transcribe_btn = tk.Button(toolbar, text="(C-t) Transcribe track", command=None)
transcribe_btn.pack(side='left')
discard_btn = tk.Button(toolbar, text="(C-d) Discard track", command=None)
discard_btn.pack(side='left')

back = tk.Frame(master=mw, bg='black')
back.bind('<Key>', keyEvent)
back.bind("<Control-p>", playTrack)
back.bind("<Control-t>", transcribeTrack)
back.bind("<Control-s>", saveTrack)
back.bind("<Control-d>", discardTrack)
back.pack_propagate(0)
back.pack(fill=tk.BOTH, expand=1)
back.focus_set()

# canvas = tk.Canvas(master=back, width='1280', height='720', bg='black', highlightthickness=0)
# canvas.pack()

w = tk.Label(master=back, text=onlyfiles[IDX], bg='black', fg='white')
w.pack()

v = tk.StringVar()
v.trace("w", lambda name, index, mode, sv=v: callback(v))
e = tk.Entry(master=back, textvariable=v, width='500', bg='black', fg='white', highlightbackground='grey')
e.pack()
e.bind('<Return>', comp_s)
e.bind("<Control-p>", playTrack)
e.bind("<Control-t>", transcribeTrack)
e.bind("<Control-s>", saveTrack)
e.bind("<Control-d>", discardTrack)

o1 = tk.Label(master=back, text="text here", bg='black', fg='white')
o1.pack()
o2 = tk.Label(master=back, text="arpabet here", bg='black', fg='white')
o2.pack()

mw.mainloop()