from youtube_transcript_api import YouTubeTranscriptApi
import re
import os

# Directory that receives one "<video_id>.txt" transcript file per video.
OUTPUT_DIR = "transcripts"
# exist_ok=True makes creation idempotent and avoids the check-then-create
# race of testing os.path.exists() first.
os.makedirs(OUTPUT_DIR, exist_ok=True)

def extract_video_id(url):
    """Return the 11-character video ID found in a YouTube URL, or None.

    Recognised forms:
      * ``watch?v=<id>`` (extra query parameters are allowed)
      * ``youtu.be/<id>``
      * ``/embed/<id>``, ``/shorts/<id>``, ``/live/<id>``, ``/v/<id>``,
        ``/e/<id>``

    The URL is parsed structurally instead of being scanned with a single
    regex, so 11-character runs in the host name or in unrelated path
    components (e.g. ``youtube-nocookie.com`` or ``/channel/UC...``) are not
    mistaken for a video ID.

    Args:
        url: URL string. A missing scheme is tolerated.

    Returns:
        The video ID as a string, or None when no ID can be found or when
        ``url`` is not a string.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import parse_qs, urlparse

    if not isinstance(url, str):
        return None

    text = url.strip()
    if "://" not in text:
        # urlparse only fills in the host when a scheme is present.
        text = "https://" + text.lstrip("/")

    try:
        parts = urlparse(text)
        host = (parts.hostname or "").lower()
        candidates = list(parse_qs(parts.query).get("v", []))
    except ValueError:
        # Raised for malformed URLs (e.g. unbalanced IPv6 brackets).
        return None

    segments = [segment for segment in parts.path.split("/") if segment]
    if host in ("youtu.be", "www.youtu.be"):
        # Short links carry the ID as the first path segment.
        candidates.extend(segments[:1])
    elif len(segments) >= 2 and segments[0].lower() in (
        "embed",
        "shorts",
        "live",
        "v",
        "e",
    ):
        candidates.append(segments[1])

    # Exactly 11 ID characters; the lookahead rejects longer tokens instead
    # of silently truncating them, while still tolerating trailing
    # punctuation such as ")" or ",".
    id_token = re.compile(r"([0-9A-Za-z_-]{11})(?![0-9A-Za-z_-])")
    for candidate in candidates:
        match = id_token.match(candidate)
        if match:
            return match.group(1)
    return None

def get_batch_transcripts(url_list):
    """Download the transcript of every video in ``url_list`` into OUTPUT_DIR.

    Each URL is resolved to a video ID with ``extract_video_id``; URLs that
    yield no ID are reported and skipped. The text of the fetched transcript
    items is joined with single spaces and written as UTF-8 to
    ``<OUTPUT_DIR>/<video_id>.txt``. A failure for one video is printed and
    does not stop the rest of the batch.

    Args:
        url_list: Iterable of YouTube URLs. ``None`` or an empty collection
            means there is nothing to do.

    Returns:
        A list of the transcript file paths that were written, in
        processing order.
    """
    saved_files = []
    if not url_list:
        return saved_files

    # One client is shared by the whole batch instead of building a new one
    # per video. It is created lazily inside the try block so a construction
    # failure is still reported per video rather than aborting the run.
    api = None

    for url in url_list:
        video_id = extract_video_id(url)
        if not video_id:
            print(f"Skipping invalid URL: {url}")
            continue

        try:
            print(f"Fetching: {video_id}...")
            if api is None:
                api = YouTubeTranscriptApi()
            transcript = api.fetch(video_id)
            full_text = " ".join(snippet.text for snippet in transcript)

            # Re-create the directory if it was removed after import.
            os.makedirs(OUTPUT_DIR, exist_ok=True)
            filename = os.path.join(OUTPUT_DIR, f"{video_id}.txt")
            with open(filename, "w", encoding="utf-8") as f:
                f.write(full_text)
        except Exception as e:
            # Deliberately broad: this is the per-video boundary of a
            # best-effort batch, so any error is reported and the loop goes on.
            print(f"Failed to get {video_id}: {str(e)}")
        else:
            saved_files.append(filename)
            print(f"Done: {filename}")

    return saved_files

if __name__ == "__main__":
    # Batch 20: more videos to transcribe.
    # NOTE(review): the earlier note on this batch said "18 new", but 17 URLs
    # are listed below - confirm that none is missing.
    get_batch_transcripts(
        [
            "https://www.youtube.com/watch?v=vqHBfe3r4OQ",
            "https://www.youtube.com/watch?v=lFGK0IvPaNc",
            "https://www.youtube.com/watch?v=fHGMA8hLNt8",
            "https://www.youtube.com/watch?v=4867-EkHWzM",
            "https://www.youtube.com/watch?v=421T2iWTQio",
            "https://www.youtube.com/watch?v=MZZCW179nKM",
            "https://www.youtube.com/watch?v=901VMcZq8X4",
            "https://www.youtube.com/watch?v=kS1MJFZWMq4",
            "https://www.youtube.com/watch?v=EwAd-fqQfJ8",
            "https://www.youtube.com/watch?v=MXxFErrmIp0",
            "https://www.youtube.com/watch?v=m-5DjcgFmfQ",
            "https://www.youtube.com/watch?v=4e0o6lGM2VE",
            "https://www.youtube.com/watch?v=fOxC44g8vig",
            "https://www.youtube.com/watch?v=wO8EboopboU",
            "https://www.youtube.com/watch?v=HCwfRe5EHGQ",
            "https://www.youtube.com/watch?v=m3jiIowIi5I",
            "https://www.youtube.com/watch?v=tTZ4yDY4adg",
        ]
    )
