import sqlite3
import json
import pandas as pd
from openai import OpenAI
import os
from dotenv import load_dotenv

load_dotenv()

client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

DB_PATH = "pcm_projects.db"
OUTPUT_EXCEL = "extracted_urls.xlsx"
MAX_CHUNK_SIZE = 3000  # characters per GPT call
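
# Processing flow (sketch of how the pieces below fit together):
#   init_log_table() -> fetch_data_for_gpt() -> chunk_text()
#   -> extract_urls_from_gpt() per chunk -> update_log() -> update_excel()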


# -------------------------------------------------
# Create log table (if not exists)
# -------------------------------------------------
def init_log_table():
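    """Create the gpt_processing_log table used to track which projects
    have already been processed and at which updatedAt timestamp."""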
    conn = sqlite3.connect(DB_PATH)
    cursor = conn.cursor()
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS gpt_processing_log (
            id TEXT PRIMARY KEY,
            updatedAt TEXT NOT NULL
        )
    """)
    conn.commit()
    conn.close()


# -------------------------------------------------
# Fetch NEW or UPDATED rows only
# -------------------------------------------------
def fetch_data_for_gpt():
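    """Return only projects that are missing from the processing log
    or whose updatedAt is newer than the one recorded at the last run."""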
    conn = sqlite3.connect(DB_PATH)
    cursor = conn.cursor()

    cursor.execute("""
        SELECT p.id, p.description, p.updatedAt, p.sub_descriptions
        FROM projects p
        LEFT JOIN gpt_processing_log g ON p.id = g.id
        WHERE g.id IS NULL OR p.updatedAt > g.updatedAt
    """)

    rows = cursor.fetchall()
    conn.close()
    return rows


# -------------------------------------------------
# Fetch all existing project IDs from DB
# (used to delete removed projects from Excel)
# -------------------------------------------------
def fetch_all_project_ids():
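    """Return the set of all project IDs currently present in the database."""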
    conn = sqlite3.connect(DB_PATH)
    cursor = conn.cursor()
    cursor.execute("SELECT id FROM projects")
    rows = cursor.fetchall()
    conn.close()
    return {r[0] for r in rows}


# -------------------------------------------------
# Chunk text for GPT
# -------------------------------------------------
def chunk_text(text):
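    """Split text into chunks of at most MAX_CHUNK_SIZE characters,
    preferring to break at the last space inside each window."""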
    chunks = []
    while len(text) > MAX_CHUNK_SIZE:
        split_pos = text.rfind(" ", 0, MAX_CHUNK_SIZE)
        if split_pos <= 0:
            # no usable space in the window, hard split at the limit
            split_pos = MAX_CHUNK_SIZE
        chunks.append(text[:split_pos])
        text = text[split_pos:]
    chunks.append(text)
    return chunks


# -------------------------------------------------
# GPT CALL → extract URLs only
# -------------------------------------------------
def extract_urls_from_gpt(text_chunk):
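    """Ask the model for the URLs contained in one text chunk.

    The response is expected to contain one URL per line; anything that
    does not look like a URL is filtered out by the caller. The model
    name below ("gpt-4.1") comes from the original configuration; swap
    it for any chat model available to your API key if needed.
    """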
    prompt = f"""
You are a highly accurate URL extractor.

RULES:
- Return ONLY valid URLs, one URL per line
- No explanation, no bullets, no extra text
- Do NOT guess or modify URLs
- If the text contains no URLs, return nothing

TEXT:
{text_chunk}
"""

    response = client.chat.completions.create(
        model="gpt-4.1",
        messages=[{"role": "user", "content": prompt}],
        temperature=0
    )

    content = response.choices[0].message.content or ""
    return content.strip().splitlines()


# -------------------------------------------------
# Update log after processing
# -------------------------------------------------
def update_log(project_id, updatedAt):
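    """Record (or refresh) the processed marker for a project so it is
    skipped on the next run unless its updatedAt changes again."""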
    conn = sqlite3.connect(DB_PATH)
    cursor = conn.cursor()
    cursor.execute("""
        INSERT OR REPLACE INTO gpt_processing_log (id, updatedAt)
        VALUES (?, ?)
    """, (project_id, updatedAt))
    conn.commit()
    conn.close()


# -------------------------------------------------
# Save to Excel → drop reprocessed → append → dedupe → remove deleted rows
# -------------------------------------------------
def update_excel(final_urls, processed_ids):
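    """Merge newly extracted URLs into the Excel sheet.

    Old rows for every project processed in this run are dropped first,
    then the new rows are appended, duplicates removed, and rows whose
    project no longer exists in the database are purged.
    Note: pandas needs the openpyxl package to read/write .xlsx files.
    """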
    # Load old sheet or create empty DF
    if os.path.exists(OUTPUT_EXCEL):
        old_df = pd.read_excel(OUTPUT_EXCEL)
    else:
        old_df = pd.DataFrame(columns=["project_id", "url"])

    # Fixed columns so an empty result list still yields a valid frame
    new_df = pd.DataFrame(final_urls, columns=["project_id", "url"])

    # -------------------------------------------------------
    # ⭐ FIX: Remove old URLs for every project processed in this run,
    # including projects that yielded zero URLs this time
    # -------------------------------------------------------
    processed_ids = {str(pid) for pid in processed_ids}
    old_df = old_df[~old_df["project_id"].astype(str).isin(processed_ids)]
    # -------------------------------------------------------

    # Append new data
    combined = pd.concat([old_df, new_df], ignore_index=True)

    # Remove duplicates
    combined.drop_duplicates(subset=["project_id", "url"], inplace=True)

    # Remove URLs belonging to deleted project IDs
    existing_ids = {str(pid) for pid in fetch_all_project_ids()}
    combined = combined[combined["project_id"].astype(str).isin(existing_ids)]

    combined.to_excel(OUTPUT_EXCEL, index=False)
    print(f"📁 Excel updated: {OUTPUT_EXCEL}")


# -------------------------------------------------
# MAIN PROCESS
# -------------------------------------------------
def process_all_projects_for_gpt():
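    """End-to-end run: find new/updated projects, extract their URLs via
    GPT, log each processed project, and refresh the Excel output."""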
    init_log_table()
    rows = fetch_data_for_gpt()

    if not rows:
        print("✔ No new or updated data found.")
        return

    final_urls = []
    processed_ids = set()

    for project_id, description, updatedAt, sub_desc in rows:
        print(f"🔍 Processing ID: {project_id}")

        sub_list = json.loads(sub_desc) if sub_desc else []
        full_text = (description or "") + "\n" + "\n".join(sub_list)

        chunks = chunk_text(full_text)

        collected_urls = []

        for chunk in chunks:
            urls = extract_urls_from_gpt(chunk)
            for u in urls:
                u = u.strip()
                if u.startswith("http://") or u.startswith("https://"):
                    collected_urls.append(u)

        collected_urls = list(set(collected_urls))

        for u in collected_urls:
            final_urls.append({
                "project_id": project_id,
                "url": u
            })

        update_log(project_id, updatedAt)
        processed_ids.add(project_id)
        print(f"✔ Completed ID {project_id} ({len(collected_urls)} URLs)")

    # Update Excel (replace reprocessed IDs + dedupe + remove deleted IDs)
    update_excel(final_urls, processed_ids)

    print("🎉 Completed processing all new/updated projects.")


if __name__ == "__main__":
    process_all_projects_for_gpt()
