Save youtube before it dies
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

240 lines
6.8 KiB

1 year ago
  1. import datetime
  2. import subprocess
  3. import json
  4. import sqlite3
  5. import typesense
  6. from sentence_transformers import SentenceTransformer
  7. from dotenv import dotenv_values
  8. config = dotenv_values(".env")
  9. ts_client = typesense.Client({
  10. 'api_key': config["TYPESENSE_API_KEY"],
  11. 'nodes': [{
  12. 'host': 'typesense',
  13. 'port': '8108',
  14. 'protocol': 'http'
  15. }],
  16. 'connection_timeout_seconds': 2
  17. })
  18. embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L12-v2", cache_folder="/data")
  19. def fetch_or_create_search_key():
  20. conn = sqlite3.connect("/data/database.sqlite3")
  21. cursor = conn.cursor()
  22. cursor.execute("SELECT key FROM typesense_keys LIMIT 1")
  23. rows = cursor.fetchall()
  24. if len(rows) == 0:
  25. key = ts_client.keys.create({"description": "Search-only key.", "actions": ["documents:search"], "collections": ["*"]})
  26. key = key["value"]
  27. cursor.execute("INSERT INTO typesense_keys (key) VALUES (?)", (key,))
  28. conn.commit()
  29. else:
  30. key = rows[0][0]
  31. conn.close()
  32. return key
  33. def get_video_json(video_id):
  34. conn = sqlite3.connect("/data/database.sqlite3")
  35. cursor = conn.cursor()
  36. cursor.execute("""SELECT info, status FROM discovered_videos
  37. WHERE id = ? LIMIT 1""", (video_id,))
  38. result = cursor.fetchall()[0]
  39. data = json.loads(result[0])
  40. data["status"] = result[1]
  41. conn.close()
  42. return data
  43. def hide_videos(video_ids):
  44. conn = sqlite3.connect("/data/database.sqlite3")
  45. cursor = conn.cursor()
  46. cursor.execute(f"""UPDATE OR IGNORE discovered_videos
  47. SET status = 'HIDDEN'
  48. WHERE status = 'DISCOVERED'
  49. AND id IN ({','.join(['?'] * len(video_ids))})
  50. RETURNING status""", video_ids)
  51. new_status = cursor.fetchall()[0][0]
  52. conn.commit()
  53. conn.close()
  54. for video_id in video_ids:
  55. doc = {
  56. "id": video_id,
  57. "status": "HIDDEN"
  58. }
  59. ts_client.collections["discovered_videos"].documents[video_id].update(doc)
  60. return new_status
  61. def request_videos(video_ids):
  62. conn = sqlite3.connect("/data/database.sqlite3")
  63. cursor = conn.cursor()
  64. cursor.execute(f"""UPDATE OR ABORT discovered_videos
  65. SET status = 'REQUESTED'
  66. WHERE status = 'DISCOVERED'
  67. AND id IN ({','.join(['?'] * len(video_ids))})
  68. RETURNING status""", video_ids)
  69. new_status = cursor.fetchall()[0][0]
  70. conn.commit()
  71. conn.close()
  72. for video_id in video_ids:
  73. doc = {
  74. "id": video_id,
  75. "status": "REQUESTED"
  76. }
  77. ts_client.collections["discovered_videos"].documents[video_id].update(doc)
  78. return new_status
  79. def download_video(video_id):
  80. conn = sqlite3.connect("/data/database.sqlite3")
  81. cursor = conn.cursor()
  82. cursor.execute("""UPDATE OR FAIL discovered_videos
  83. set status = 'DOWNLOADING'
  84. where status = 'REQUESTED'
  85. and id = ?
  86. returning json_extract(info, '$.original_url')""", (video_id,))
  87. original_url = cursor.fetchall()[0][0]
  88. conn.commit()
  89. doc = {
  90. "id": video_id,
  91. "status": "DOWNLOADING"
  92. }
  93. ts_client.collections["discovered_videos"].documents[video_id].update(doc)
  94. command = [
  95. "/app/bin/yt-dlp",
  96. "-S",
  97. "res:1080",
  98. "-P",
  99. "/data",
  100. "-o",
  101. "%(extractor)s-store/%(channel)s - %(channel_id)s/%(id)s/%(title)s.%(ext)s",
  102. "--write-subs",
  103. "--write-auto-subs",
  104. "--sub-langs",
  105. "\"en-US,en,en-us,en-gb,en-GB\"",
  106. "--write-thumbnail",
  107. "--continue",
  108. "--embed-chapters",
  109. "--embed-subs",
  110. original_url
  111. ]
  112. returncode = subprocess.run(command).returncode
  113. if returncode != 0:
  114. cursor.execute("""UPDATE OR FAIL discovered_videos
  115. SET status = 'REQUESTED'
  116. WHERE status = 'DOWNLOADING'
  117. AND id = ?""", (video_id,))
  118. conn.commit()
  119. conn.close()
  120. doc = {
  121. "id": video_id,
  122. "status": "REQUESTED"
  123. }
  124. ts_client.collections["discovered_videos"].documents[video_id].update(doc)
  125. raise Exception(f"Download failed for URL: {original_url}")
  126. return
  127. cursor.execute("""UPDATE OR FAIL discovered_videos
  128. SET status = 'DOWNLOADED'
  129. WHERE status = 'DOWNLOADING'
  130. AND id = ?""", (video_id,))
  131. conn.commit()
  132. conn.close()
  133. doc = {
  134. "id": video_id,
  135. "status": "DOWNLOADED"
  136. }
  137. ts_client.collections["discovered_videos"].documents[video_id].update(doc)
  138. def scrape_url(url):
  139. command = [
  140. "/app/bin/yt-dlp",
  141. "--dump-json",
  142. "--write-subs",
  143. "--sponsorblock-mark",
  144. "all",
  145. "--sub-langs",
  146. "\"en-US,en,en-us,en-gb,en-GB\"",
  147. url
  148. ]
  149. output = subprocess.check_output(command).decode("utf-8")
  150. for line in output.splitlines():
  151. video = json.loads(line)
  152. del video["formats"]
  153. del video["requested_formats"]
  154. del video["thumbnails"]
  155. del video["automatic_captions"]
  156. data = json.dumps(video)
  157. video_id = video["id"]
  158. conn = sqlite3.connect("/data/database.sqlite3")
  159. cursor = conn.cursor()
  160. cursor.execute("INSERT INTO discovered_videos (id, info, status) VALUES (?, ?, 'DISCOVERED') ON CONFLICT(id) DO UPDATE SET info = excluded.info", (video_id, data))
  161. channel_id = video["channel_id"]
  162. playlist_id = video.get("playlist_id", None)
  163. if playlist_id and channel_id != playlist_id:
  164. cursor.execute("INSERT INTO videos_in_playlists (video_id, playlist_id, name) VALUES (?, ?, ?) ON CONFLICT(video_id, playlist_id) DO NOTHING", (video_id, playlist_id, video["playlist_name"]))
  165. conn.commit()
  166. cursor.execute("SELECT DISTINCT playlist_id FROM videos_in_playlists WHERE video_id = ?", (video_id,))
  167. rows = cursor.fetchall()
  168. playlist_ids = [row[0] for row in rows]
  169. conn.close()
  170. embeddings = embedding_model.encode(video["fulltitle"])
  171. document = {
  172. "id": video["id"],
  173. "fulltitle": video["fulltitle"],
  174. "title_vec": embeddings.tolist(),
  175. "description": video["description"],
  176. "channel": video["channel"],
  177. "channel_follower_count": video["channel_follower_count"],
  178. "channel_id": video["channel_id"],
  179. "duration": video["duration"],
  180. "view_count": video["view_count"],
  181. "upload_date": int(video["upload_date"]),
  182. "filesize_approx": video["filesize_approx"],
  183. "extractor": video["extractor"],
  184. "thumbnail": video["thumbnail"],
  185. "status": "DISCOVERED",
  186. "requested_by": "",
  187. "playlist_ids": playlist_ids,
  188. }
  189. ts_client.collections["discovered_videos"].documents.upsert(document)