Save youtube before it dies
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 
 

240 lines
6.8 KiB

import datetime
import subprocess
import json
import sqlite3
import typesense
from sentence_transformers import SentenceTransformer
from dotenv import dotenv_values
# Settings are read from the ".env" file in the working directory.
config = dotenv_values(".env")

# Typesense client talking to a single node ("typesense", HTTP, port 8108),
# authenticated with the TYPESENSE_API_KEY entry of the settings.
ts_client = typesense.Client(
    dict(
        api_key=config["TYPESENSE_API_KEY"],
        nodes=[dict(host="typesense", port="8108", protocol="http")],
        connection_timeout_seconds=2,
    )
)

# Sentence-embedding model; its files are cached under /data.
embedding_model = SentenceTransformer(
    "sentence-transformers/all-MiniLM-L12-v2",
    cache_folder="/data",
)
def fetch_or_create_search_key():
    """Return the Typesense search-only API key, creating it on first use.

    The key is looked up in the ``typesense_keys`` table of the SQLite
    database.  When the table is empty, a new key limited to the
    ``documents:search`` action on all collections is created through
    ``ts_client``, stored in the table, and returned.

    Returns:
        The key value, either read from the ``key`` column or taken from the
        ``"value"`` field of the Typesense key-creation response.
    """
    conn = sqlite3.connect("/data/database.sqlite3")
    try:
        cursor = conn.cursor()
        cursor.execute("SELECT key FROM typesense_keys LIMIT 1")
        row = cursor.fetchone()
        if row is not None:
            return row[0]

        created = ts_client.keys.create({
            "description": "Search-only key.",
            "actions": ["documents:search"],
            "collections": ["*"],
        })
        key = created["value"]
        cursor.execute("INSERT INTO typesense_keys (key) VALUES (?)", (key,))
        conn.commit()
        return key
    finally:
        # Close the connection even when the query or the key creation fails.
        conn.close()
def get_video_json(video_id):
    """Load the stored metadata of one discovered video.

    Args:
        video_id: Value of the ``id`` column in ``discovered_videos``.

    Returns:
        The row's ``info`` column decoded from JSON, with the row's
        ``status`` column added under the ``"status"`` key.

    Raises:
        IndexError: If no row with that id exists.
    """
    conn = sqlite3.connect("/data/database.sqlite3")
    try:
        cursor = conn.cursor()
        cursor.execute("""SELECT info, status FROM discovered_videos
            WHERE id = ? LIMIT 1""", (video_id,))
        row = cursor.fetchone()
    finally:
        # Close the connection on every path, including lookup failures.
        conn.close()

    if row is None:
        # Same exception type as indexing an empty result list, but with a
        # message that says which id was missing.
        raise IndexError(f"no discovered video with id {video_id!r}")

    info, status = row
    data = json.loads(info)
    data["status"] = status
    return data
def hide_videos(video_ids):
    """Mark discovered videos as hidden in SQLite and in Typesense.

    Only rows whose status is currently ``'DISCOVERED'`` are changed.  The
    Typesense documents are updated for exactly those rows, so the search
    index is not changed for ids the database left untouched.

    Args:
        video_ids: Iterable of video ids to hide.

    Returns:
        The status reported by the UPDATE's ``RETURNING`` clause for the
        first changed row (``'HIDDEN'``).

    Raises:
        IndexError: If none of the given ids was in ``'DISCOVERED'`` state.
    """
    video_ids = list(video_ids)
    placeholders = ",".join("?" * len(video_ids))

    conn = sqlite3.connect("/data/database.sqlite3")
    try:
        cursor = conn.cursor()
        cursor.execute(f"""UPDATE OR IGNORE discovered_videos
            SET status = 'HIDDEN'
            WHERE status = 'DISCOVERED'
            AND id IN ({placeholders})
            RETURNING id, status""", video_ids)
        changed = cursor.fetchall()
        conn.commit()
    finally:
        conn.close()

    if not changed:
        raise IndexError("none of the given videos is in DISCOVERED state")

    # Mirror only the rows the database actually changed.
    for changed_id, _ in changed:
        ts_client.collections["discovered_videos"].documents[changed_id].update(
            {"id": changed_id, "status": "HIDDEN"}
        )
    return changed[0][1]
def request_videos(video_ids):
    """Mark discovered videos as requested in SQLite and in Typesense.

    Only rows whose status is currently ``'DISCOVERED'`` are changed.  The
    Typesense documents are updated for exactly those rows, so the search
    index is not changed for ids the database left untouched.

    Args:
        video_ids: Iterable of video ids to request.

    Returns:
        The status reported by the UPDATE's ``RETURNING`` clause for the
        first changed row (``'REQUESTED'``).

    Raises:
        IndexError: If none of the given ids was in ``'DISCOVERED'`` state.
    """
    video_ids = list(video_ids)
    placeholders = ",".join("?" * len(video_ids))

    conn = sqlite3.connect("/data/database.sqlite3")
    try:
        cursor = conn.cursor()
        cursor.execute(f"""UPDATE OR ABORT discovered_videos
            SET status = 'REQUESTED'
            WHERE status = 'DISCOVERED'
            AND id IN ({placeholders})
            RETURNING id, status""", video_ids)
        changed = cursor.fetchall()
        conn.commit()
    finally:
        conn.close()

    if not changed:
        raise IndexError("none of the given videos is in DISCOVERED state")

    # Mirror only the rows the database actually changed.
    for changed_id, _ in changed:
        ts_client.collections["discovered_videos"].documents[changed_id].update(
            {"id": changed_id, "status": "REQUESTED"}
        )
    return changed[0][1]
def download_video(video_id):
    """Download one requested video with yt-dlp, tracking it via its status.

    The video is moved from ``'REQUESTED'`` to ``'DOWNLOADING'``, yt-dlp is
    run on the ``original_url`` stored in the row's ``info`` JSON, and the
    status then becomes ``'DOWNLOADED'`` on success or goes back to
    ``'REQUESTED'`` when yt-dlp exits with a non-zero code.  Each status
    change is committed to SQLite and then mirrored to the Typesense
    document.

    Args:
        video_id: Id of the video in ``discovered_videos``.

    Raises:
        IndexError: If no row with that id is in ``'REQUESTED'`` state.
        Exception: If yt-dlp exits with a non-zero return code.
    """
    def publish_status(status):
        # Mirror the status stored in SQLite to the Typesense document.
        ts_client.collections["discovered_videos"].documents[video_id].update(
            {"id": video_id, "status": status}
        )

    conn = sqlite3.connect("/data/database.sqlite3")
    try:
        cursor = conn.cursor()
        cursor.execute("""UPDATE OR FAIL discovered_videos
            set status = 'DOWNLOADING'
            where status = 'REQUESTED'
            and id = ?
            returning json_extract(info, '$.original_url')""", (video_id,))
        claimed = cursor.fetchall()
        if not claimed:
            raise IndexError(
                f"video {video_id!r} is not in REQUESTED state"
            )
        original_url = claimed[0][0]
        conn.commit()
        publish_status("DOWNLOADING")

        command = [
            "/app/bin/yt-dlp",
            "-S",
            "res:1080",
            "-P",
            "/data",
            "-o",
            "%(extractor)s-store/%(channel)s - %(channel_id)s/%(id)s/%(title)s.%(ext)s",
            "--write-subs",
            "--write-auto-subs",
            "--sub-langs",
            # No shell is involved, so the value must not carry its own
            # quotes: they would become part of the first and last pattern.
            "en-US,en,en-us,en-gb,en-GB",
            "--write-thumbnail",
            "--continue",
            "--embed-chapters",
            "--embed-subs",
            original_url,
        ]
        succeeded = subprocess.run(command).returncode == 0

        # A failed download goes back to REQUESTED.
        final_status = "DOWNLOADED" if succeeded else "REQUESTED"
        cursor.execute("""UPDATE OR FAIL discovered_videos
            SET status = ?
            WHERE status = 'DOWNLOADING'
            AND id = ?""", (final_status, video_id))
        conn.commit()
    finally:
        # Close the connection on every path, including errors.
        conn.close()

    publish_status(final_status)
    if not succeeded:
        raise Exception(f"Download failed for URL: {original_url}")
def scrape_url(url):
    """Scrape metadata for ``url`` with yt-dlp and index every video found.

    yt-dlp is run with ``--dump-json``; each output line is parsed as one
    video.  For every video the function:

    * stores the JSON (minus a few bulky keys) in ``discovered_videos``,
      inserting new rows as ``'DISCOVERED'`` and refreshing ``info`` on
      existing rows,
    * records playlist membership in ``videos_in_playlists`` when the video
      has a ``playlist_id`` that differs from its ``channel_id``,
    * upserts a Typesense document holding selected fields, an embedding of
      the title, and all playlist ids known for the video.

    Args:
        url: URL passed to yt-dlp.

    Raises:
        subprocess.CalledProcessError: If yt-dlp exits with a non-zero code.
        KeyError: If a video lacks one of the fields copied into the
            database rows or the Typesense document.
    """
    command = [
        "/app/bin/yt-dlp",
        "--dump-json",
        "--write-subs",
        "--sponsorblock-mark",
        "all",
        "--sub-langs",
        # No shell is involved, so the value must not carry its own quotes:
        # they would become part of the first and last language pattern.
        "en-US,en,en-us,en-gb,en-GB",
        url,
    ]
    output = subprocess.check_output(command).decode("utf-8")

    # Keys dropped before storing; not every dump contains all of them.
    bulky_keys = ("formats", "requested_formats", "thumbnails", "automatic_captions")

    conn = sqlite3.connect("/data/database.sqlite3")
    try:
        cursor = conn.cursor()
        for line in output.splitlines():
            if not line.strip():
                continue
            video = json.loads(line)
            for bulky_key in bulky_keys:
                video.pop(bulky_key, None)
            data = json.dumps(video)
            video_id = video["id"]

            cursor.execute("INSERT INTO discovered_videos (id, info, status) VALUES (?, ?, 'DISCOVERED') ON CONFLICT(id) DO UPDATE SET info = excluded.info", (video_id, data))
            channel_id = video["channel_id"]
            playlist_id = video.get("playlist_id", None)
            if playlist_id and channel_id != playlist_id:
                # NOTE(review): confirm yt-dlp emits "playlist_name"; its
                # documented fields look like "playlist"/"playlist_title".
                cursor.execute("INSERT INTO videos_in_playlists (video_id, playlist_id, name) VALUES (?, ?, ?) ON CONFLICT(video_id, playlist_id) DO NOTHING", (video_id, playlist_id, video["playlist_name"]))
            conn.commit()

            cursor.execute("SELECT DISTINCT playlist_id FROM videos_in_playlists WHERE video_id = ?", (video_id,))
            playlist_ids = [row[0] for row in cursor.fetchall()]

            embeddings = embedding_model.encode(video["fulltitle"])
            document = {
                "id": video["id"],
                "fulltitle": video["fulltitle"],
                "title_vec": embeddings.tolist(),
                "description": video["description"],
                "channel": video["channel"],
                "channel_follower_count": video["channel_follower_count"],
                "channel_id": video["channel_id"],
                "duration": video["duration"],
                "view_count": video["view_count"],
                "upload_date": int(video["upload_date"]),
                "filesize_approx": video["filesize_approx"],
                "extractor": video["extractor"],
                "thumbnail": video["thumbnail"],
                "status": "DISCOVERED",
                "requested_by": "",
                "playlist_ids": playlist_ids,
            }
            ts_client.collections["discovered_videos"].documents.upsert(document)
    finally:
        # One connection serves the whole run and is closed even on errors.
        conn.close()