import datetime
|
|
import subprocess
|
|
import json
|
|
import sqlite3
|
|
|
|
import typesense
|
|
from sentence_transformers import SentenceTransformer
|
|
from dotenv import dotenv_values
|
|
|
|
# Settings read from the local ".env" file.
config = dotenv_values(".env")

# Typesense client: a single node reached at host "typesense" over HTTP on
# port 8108, authenticated with TYPESENSE_API_KEY from the settings above.
ts_client = typesense.Client(
    {
        "api_key": config["TYPESENSE_API_KEY"],
        "nodes": [
            {"host": "typesense", "port": "8108", "protocol": "http"},
        ],
        "connection_timeout_seconds": 2,
    }
)

# Sentence-embedding model; model files are cached under /data.
embedding_model = SentenceTransformer(
    "sentence-transformers/all-MiniLM-L12-v2",
    cache_folder="/data",
)
|
|
|
|
def fetch_or_create_search_key(db_path="/data/database.sqlite3"):
    """Return the Typesense search-only API key, creating one if none is stored.

    The first key found in the ``typesense_keys`` table is reused. When the
    table is empty, a new key restricted to the ``documents:search`` action on
    all collections is created through ``ts_client`` and saved in the table.

    Args:
        db_path: Path of the SQLite database file. Defaults to the
            application's database.

    Returns:
        The key value read from, or just inserted into, ``typesense_keys``.
    """
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        cursor.execute("SELECT key FROM typesense_keys LIMIT 1")
        row = cursor.fetchone()
        if row is not None:
            return row[0]

        created = ts_client.keys.create({
            "description": "Search-only key.",
            "actions": ["documents:search"],
            "collections": ["*"],
        })
        key = created["value"]

        cursor.execute("INSERT INTO typesense_keys (key) VALUES (?)", (key,))
        conn.commit()
        return key
    finally:
        # Release the connection even when the Typesense call or the insert
        # raises; previously it was leaked on those paths.
        conn.close()
|
|
|
|
|
|
def get_video_json(video_id, db_path="/data/database.sqlite3"):
    """Load the stored info JSON of one discovered video, plus its status.

    Args:
        video_id: Value of the ``id`` column in ``discovered_videos``.
        db_path: Path of the SQLite database file. Defaults to the
            application's database.

    Returns:
        The decoded ``info`` JSON with an added ``"status"`` entry holding
        the row's ``status`` column.

    Raises:
        IndexError: If no row with this id exists.
    """
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        cursor.execute(
            """SELECT info, status FROM discovered_videos
               WHERE id = ? LIMIT 1""",
            (video_id,),
        )
        row = cursor.fetchone()
    finally:
        # Close on every path, including a failing query.
        conn.close()

    if row is None:
        # Same exception type the old ``fetchall()[0]`` produced, but with a
        # message that says what went wrong.
        raise IndexError(f"No discovered video with id {video_id!r}")

    info, status = row
    data = json.loads(info)
    data["status"] = status
    return data
|
|
|
|
def hide_videos(video_ids, db_path="/data/database.sqlite3"):
    """Move DISCOVERED videos to HIDDEN in SQLite and mirror that to Typesense.

    Only rows currently in the ``DISCOVERED`` state are changed. The Typesense
    documents are updated for exactly the ids the database reports as changed,
    so the search index is not altered for ids whose row was left untouched.

    Args:
        video_ids: Iterable of video ids to hide.
        db_path: Path of the SQLite database file. Defaults to the
            application's database.

    Returns:
        The status the database returned for the updated rows (``"HIDDEN"``).

    Raises:
        IndexError: If none of the given ids was in the ``DISCOVERED`` state
            (this includes an empty ``video_ids``).
    """
    video_ids = list(video_ids)
    placeholders = ",".join(["?"] * len(video_ids))

    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        cursor.execute(
            f"""UPDATE OR IGNORE discovered_videos
                SET status = 'HIDDEN'
                WHERE status = 'DISCOVERED'
                AND id IN ({placeholders})
                RETURNING id, status""",
            video_ids,
        )
        # Fetch every RETURNING row before committing.
        updated = cursor.fetchall()
        if not updated:
            raise IndexError(
                f"None of the videos {video_ids!r} is in the DISCOVERED state"
            )
        conn.commit()
    finally:
        # Previously the connection was leaked when no row matched.
        conn.close()

    for video_id, _status in updated:
        doc = {"id": video_id, "status": "HIDDEN"}
        ts_client.collections["discovered_videos"].documents[video_id].update(doc)

    return updated[0][1]
|
|
|
|
|
|
def request_videos(video_ids, db_path="/data/database.sqlite3"):
    """Move DISCOVERED videos to REQUESTED in SQLite and mirror that to Typesense.

    Only rows currently in the ``DISCOVERED`` state are changed. The Typesense
    documents are updated for exactly the ids the database reports as changed,
    so the search index is not altered for ids whose row was left untouched.

    Args:
        video_ids: Iterable of video ids to request.
        db_path: Path of the SQLite database file. Defaults to the
            application's database.

    Returns:
        The status the database returned for the updated rows
        (``"REQUESTED"``).

    Raises:
        IndexError: If none of the given ids was in the ``DISCOVERED`` state
            (this includes an empty ``video_ids``).
    """
    video_ids = list(video_ids)
    placeholders = ",".join(["?"] * len(video_ids))

    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        cursor.execute(
            f"""UPDATE OR ABORT discovered_videos
                SET status = 'REQUESTED'
                WHERE status = 'DISCOVERED'
                AND id IN ({placeholders})
                RETURNING id, status""",
            video_ids,
        )
        # Fetch every RETURNING row before committing.
        updated = cursor.fetchall()
        if not updated:
            raise IndexError(
                f"None of the videos {video_ids!r} is in the DISCOVERED state"
            )
        conn.commit()
    finally:
        # Previously the connection was leaked when no row matched.
        conn.close()

    for video_id, _status in updated:
        doc = {"id": video_id, "status": "REQUESTED"}
        ts_client.collections["discovered_videos"].documents[video_id].update(doc)

    return updated[0][1]
|
|
|
|
def _move_video_status(video_id, old_status, new_status, db_path):
    """Move one video from old_status to new_status in SQLite, then update its Typesense document."""
    conn = sqlite3.connect(db_path)
    try:
        conn.execute(
            """UPDATE OR FAIL discovered_videos
               SET status = ?
               WHERE status = ?
               AND id = ?""",
            (new_status, old_status, video_id),
        )
        conn.commit()
    finally:
        conn.close()

    doc = {"id": video_id, "status": new_status}
    ts_client.collections["discovered_videos"].documents[video_id].update(doc)


def download_video(video_id, db_path="/data/database.sqlite3"):
    """Download a REQUESTED video with yt-dlp and track its status.

    The video is moved ``REQUESTED`` -> ``DOWNLOADING`` before yt-dlp runs,
    then to ``DOWNLOADED`` on success or back to ``REQUESTED`` on failure.
    Every status change is written to SQLite and to the Typesense document.

    Args:
        video_id: Value of the ``id`` column in ``discovered_videos``.
        db_path: Path of the SQLite database file. Defaults to the
            application's database.

    Raises:
        IndexError: If the video is not currently in the ``REQUESTED`` state.
        Exception: If yt-dlp exits with a non-zero return code.
        OSError: If yt-dlp could not be started; the status is reset to
            ``REQUESTED`` before the error propagates.
    """
    # Claim the video. A short-lived connection is used so that no database
    # handle stays open (or gets leaked) during the download itself.
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        cursor.execute(
            """UPDATE OR FAIL discovered_videos
               SET status = 'DOWNLOADING'
               WHERE status = 'REQUESTED'
               AND id = ?
               RETURNING json_extract(info, '$.original_url')""",
            (video_id,),
        )
        # Fetch every RETURNING row before committing.
        claimed = cursor.fetchall()
        if not claimed:
            raise IndexError(f"Video {video_id!r} is not in the REQUESTED state")
        conn.commit()
    finally:
        conn.close()
    original_url = claimed[0][0]

    doc = {"id": video_id, "status": "DOWNLOADING"}
    ts_client.collections["discovered_videos"].documents[video_id].update(doc)

    command = [
        "/app/bin/yt-dlp",
        "-S",
        "res:1080",
        "-P",
        "/data",
        "-o",
        "%(extractor)s-store/%(channel)s - %(channel_id)s/%(id)s/%(title)s.%(ext)s",
        "--write-subs",
        "--write-auto-subs",
        "--sub-langs",
        # No shell is involved, so the value must not carry its own quotes:
        # they would be passed to yt-dlp as part of the first and last
        # language codes.
        "en-US,en,en-us,en-gb,en-GB",
        "--write-thumbnail",
        "--continue",
        "--embed-chapters",
        "--embed-subs",
        original_url,
    ]

    try:
        returncode = subprocess.run(command).returncode
    except OSError:
        # yt-dlp could not be launched; do not leave the video stuck in
        # DOWNLOADING.
        _move_video_status(video_id, "DOWNLOADING", "REQUESTED", db_path)
        raise

    if returncode != 0:
        _move_video_status(video_id, "DOWNLOADING", "REQUESTED", db_path)
        raise Exception(f"Download failed for URL: {original_url}")

    _move_video_status(video_id, "DOWNLOADING", "DOWNLOADED", db_path)
|
|
|
|
|
|
def _store_scraped_video(video, db_path):
    """Upsert one scraped video and its playlist link into SQLite; return (status, playlist_ids)."""
    video_id = video["id"]
    channel_id = video["channel_id"]
    playlist_id = video.get("playlist_id", None)

    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        cursor.execute(
            "INSERT INTO discovered_videos (id, info, status) VALUES (?, ?, 'DISCOVERED') "
            "ON CONFLICT(id) DO UPDATE SET info = excluded.info "
            "RETURNING status",
            (video_id, json.dumps(video)),
        )
        # An already-known video keeps its existing status in the database;
        # read it back so the search index can be kept in agreement.
        status = cursor.fetchall()[0][0]

        if playlist_id and channel_id != playlist_id:
            # NOTE(review): assumes the yt-dlp JSON carries "playlist_name"
            # whenever "playlist_id" is set - confirm against real output.
            cursor.execute(
                "INSERT INTO videos_in_playlists (video_id, playlist_id, name) VALUES (?, ?, ?) "
                "ON CONFLICT(video_id, playlist_id) DO NOTHING",
                (video_id, playlist_id, video["playlist_name"]),
            )

        conn.commit()

        cursor.execute(
            "SELECT DISTINCT playlist_id FROM videos_in_playlists WHERE video_id = ?",
            (video_id,),
        )
        playlist_ids = [row[0] for row in cursor.fetchall()]
    finally:
        # Close on every path; the connection used to leak on any error.
        conn.close()

    return status, playlist_ids


def scrape_url(url, db_path="/data/database.sqlite3"):
    """Scrape metadata for ``url`` with yt-dlp and index every video found.

    Each non-empty line of yt-dlp's ``--dump-json`` output is parsed as one
    video. The video is upserted into ``discovered_videos``, linked to its
    playlist when one is reported, embedded by title, and upserted into the
    ``discovered_videos`` Typesense collection.

    Args:
        url: URL handed to yt-dlp.
        db_path: Path of the SQLite database file. Defaults to the
            application's database.

    Raises:
        subprocess.CalledProcessError: If yt-dlp exits with a non-zero code.
    """
    command = [
        "/app/bin/yt-dlp",
        "--dump-json",
        "--write-subs",
        "--sponsorblock-mark",
        "all",
        "--sub-langs",
        # No shell is involved, so the value must not carry its own quotes:
        # they would be passed to yt-dlp as part of the first and last
        # language codes.
        "en-US,en,en-us,en-gb,en-GB",
        url,
    ]
    output = subprocess.check_output(command).decode("utf-8")

    for line in output.splitlines():
        if not line.strip():
            continue

        video = json.loads(line)
        # Drop the bulky entries before storing; not every result contains
        # all of them, so a missing key is not an error.
        for bulky_key in ("formats", "requested_formats", "thumbnails", "automatic_captions"):
            video.pop(bulky_key, None)

        status, playlist_ids = _store_scraped_video(video, db_path)

        embeddings = embedding_model.encode(video["fulltitle"])

        document = {
            "id": video["id"],
            "fulltitle": video["fulltitle"],
            "title_vec": embeddings.tolist(),
            "description": video["description"],
            "channel": video["channel"],
            "channel_follower_count": video["channel_follower_count"],
            "channel_id": video["channel_id"],
            "duration": video["duration"],
            "view_count": video["view_count"],
            "upload_date": int(video["upload_date"]),
            "filesize_approx": video["filesize_approx"],
            "extractor": video["extractor"],
            "thumbnail": video["thumbnail"],
            # Use the database's status so a re-scrape does not reset an
            # already requested/downloaded video to DISCOVERED in the index.
            "status": status,
            # NOTE(review): requested_by is still reset on every scrape; no
            # stored value for it is visible here - confirm this is intended.
            "requested_by": "",
            "playlist_ids": playlist_ids,
        }

        ts_client.collections["discovered_videos"].documents.upsert(document)
|