Rudimentary tkinter app to speed up tagging/transcribing training data for tacotron
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

185 lines
5.2 KiB

4 years ago
  1. import tkinter as tk
  2. import vlc
  3. from os import listdir, remove
  4. from os.path import isfile, join
  5. import json
  6. import shutil
  7. # Set up IBM speech to text
  8. from ibm_watson import SpeechToTextV1
  9. from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
  10. authenticator = IAMAuthenticator('b373X-km7u5pAaz2JoizXigcVFZFEB8CIntgYgWzbCQ4')
  11. speech_to_text = SpeechToTextV1(
  12. authenticator=authenticator
  13. )
  14. speech_to_text.set_service_url('https://api.us-south.speech-to-text.watson.cloud.ibm.com/instances/0e7a3edf-309c-4e64-b345-8251781245e4')
  15. speech_to_text.set_disable_ssl_verification(True)
  16. # Set up Arpabet dict
  17. def ARPA(text):
  18. if len(text) == 0:
  19. return ""
  20. out = ''
  21. for word_ in text.split(" "):
  22. word=word_; end_chars = ''
  23. while any(elem in word for elem in r"!?,.;") and len(word) > 1:
  24. if word[-1] == '!': end_chars = '!' + end_chars; word = word[:-1]
  25. if word[-1] == '?': end_chars = '?' + end_chars; word = word[:-1]
  26. if word[-1] == ',': end_chars = ',' + end_chars; word = word[:-1]
  27. if word[-1] == '.': end_chars = '.' + end_chars; word = word[:-1]
  28. if word[-1] == ';': end_chars = ';' + end_chars; word = word[:-1]
  29. else: break
  30. try: word_arpa = thisdict[word.upper()]
  31. except: word_arpa = ''
  32. if len(word_arpa)!=0: word = "{" + str(word_arpa) + "}"
  33. out = (out + " " + word + end_chars).strip()
  34. if out[-1] != "": out = out + ""
  35. return out
  36. thisdict = {} # And load it
  37. for line in reversed((open('merged.dict_1.1.txt', "r").read()).splitlines()):
  38. thisdict[(line.split(" ",1))[0]] = (line.split(" ",1))[1].strip()
  39. # Stuff for dataset path
  40. IDX = 0
  41. MYPATH = "./dataset/in"
  42. OUTPATH = './dataset/'
  43. # f = open("{}filelist.txt".format(OUTPATH), "w")
  44. # f.write("")
  45. # f.close()
  46. # f = open("{}filelist_arpa.txt".format(OUTPATH), "w")
  47. # f.write("")
  48. # f.close()
  49. onlyfiles = [f for f in listdir(MYPATH) if isfile(join(MYPATH, f))]
  50. if len(onlyfiles) > 0:
  51. img = onlyfiles[0]
  52. def keyEvent(event):
  53. global IDX
  54. global v
  55. if (event.keysym == 'Left'):
  56. IDX = IDX - 1
  57. if IDX < 0:
  58. IDX = len(onlyfiles) - 1
  59. w.config(text = onlyfiles[IDX])
  60. v.set("")
  61. if (event.keysym == 'Right'):
  62. IDX = IDX + 1
  63. if IDX >= len(onlyfiles):
  64. IDX = 0
  65. w.config(text = onlyfiles[IDX])
  66. v.set("")
  67. def comp_s(event):
  68. global v
  69. print(v.get())
  70. v.set("")
  71. def playTrack(event):
  72. player = vlc.MediaPlayer("{}/{}".format(MYPATH, onlyfiles[IDX]))
  73. player.play()
  74. def transcribeTrack(event):
  75. global v
  76. with open("{}/{}".format(MYPATH, onlyfiles[IDX]),
  77. 'rb') as audio_file:
  78. speech_recognition_results = speech_to_text.recognize(
  79. audio=audio_file,
  80. content_type='audio/wav',
  81. word_alternatives_threshold=0.9,
  82. ).get_result()
  83. # print(json.dumps(speech_recognition_results, indent=2))
  84. v.set(speech_recognition_results["results"][0]["alternatives"][0]["transcript"])
  85. def saveTrack(event):
  86. f = open("{}filelist.txt".format(OUTPATH), "a")
  87. f.write("wavs/out/{}|{}\n".format(onlyfiles[IDX], o1.cget('text')))
  88. f.close()
  89. f = open("{}filelist_arpa.txt".format(OUTPATH), "a")
  90. f.write("wavs/out/{}|{}\n".format(onlyfiles[IDX], o2.cget('text')))
  91. f.close()
  92. move_audio_file(IDX, 'out')
  93. def discardTrack(event):
  94. move_audio_file(IDX, 'discard')
  95. def callback(sv):
  96. o1.config(text = "{}␤".format(sv.get().strip()), wraplength=500)
  97. o2.config(text = ARPA(sv.get().strip()), wraplength=500)
  98. def move_audio_file(idx, destination):
  99. file_path = onlyfiles[idx]
  100. onlyfiles.pop(idx)
  101. idx = idx - 1
  102. if idx < 0:
  103. idx = len(onlyfiles) - 1
  104. shutil.copyfile(MYPATH + '/' + file_path, OUTPATH + destination + '/' + file_path)
  105. remove(MYPATH + '/' + file_path)
  106. global v
  107. w.config(text = onlyfiles[idx])
  108. v.set("")
  109. mw = tk.Tk()
  110. mw.title('Training Data - {} remaining'.format(len(onlyfiles)))
  111. mw.geometry('500x200')
  112. mw.configure(bg='black')
  113. toolbar = tk.Frame(master=mw, width='500', height='24', borderwidth=2, bg='slategray4', relief='raised')
  114. toolbar.pack()
  115. play_btn = tk.Button(toolbar, text="(C-p) Play track", command=None)
  116. play_btn.pack(side='left')
  117. save_btn = tk.Button(toolbar, text="(C-s) Save track", command=None)
  118. save_btn.pack(side='left')
  119. transcribe_btn = tk.Button(toolbar, text="(C-t) Transcribe track", command=None)
  120. transcribe_btn.pack(side='left')
  121. discard_btn = tk.Button(toolbar, text="(C-d) Discard track", command=None)
  122. discard_btn.pack(side='left')
  123. back = tk.Frame(master=mw, bg='black')
  124. back.bind('<Key>', keyEvent)
  125. back.bind("<Control-p>", playTrack)
  126. back.bind("<Control-t>", transcribeTrack)
  127. back.bind("<Control-s>", saveTrack)
  128. back.bind("<Control-d>", discardTrack)
  129. back.pack_propagate(0)
  130. back.pack(fill=tk.BOTH, expand=1)
  131. back.focus_set()
  132. # canvas = tk.Canvas(master=back, width='1280', height='720', bg='black', highlightthickness=0)
  133. # canvas.pack()
  134. w = tk.Label(master=back, text=onlyfiles[IDX], bg='black', fg='white')
  135. w.pack()
  136. v = tk.StringVar()
  137. v.trace("w", lambda name, index, mode, sv=v: callback(v))
  138. e = tk.Entry(master=back, textvariable=v, width='500', bg='black', fg='white', highlightbackground='grey')
  139. e.pack()
  140. e.bind('<Return>', comp_s)
  141. e.bind("<Control-p>", playTrack)
  142. e.bind("<Control-t>", transcribeTrack)
  143. e.bind("<Control-s>", saveTrack)
  144. e.bind("<Control-d>", discardTrack)
  145. o1 = tk.Label(master=back, text="text here", bg='black', fg='white')
  146. o1.pack()
  147. o2 = tk.Label(master=back, text="arpabet here", bg='black', fg='white')
  148. o2.pack()
  149. mw.mainloop()