"""Check every URL listed in an Excel sheet and record whether it works.

Loads URLs from extracted_urls.xlsx, drops rows whose project_id is not
present in the pcm_projects.db SQLite database, probes each remaining URL
with a persistent requests session, and writes a Status column to
urls_status.xlsx.
"""
import pandas as pd
import requests
import time
import sqlite3

# Input and Output file paths
input_file = "extracted_urls.xlsx"
output_file = "urls_status.xlsx"

# Load URLs
df = pd.read_excel(input_file)
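
# Fail-fast guard: the rest of the script reads the "project_id" and "url"
# columns, so report a missing column up front instead of raising a KeyError
# deep inside the loop.
required_cols = {"project_id", "url"}
missing_cols = required_cols - set(df.columns)
if missing_cols:
    raise KeyError(f"Input file is missing required column(s): {sorted(missing_cols)}")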

# Add a new column for Status
df["Status"] = ""

print("🔍 Checking each URL...")

# ------------------------------------------------------------
# Connect to database and load valid project IDs
# ------------------------------------------------------------
DB_PATH = "pcm_projects.db"

conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()

# Schema assumption: the database exposes a `projects` table whose `id`
# column holds the valid project IDs.
cursor.execute("SELECT id FROM projects")
valid_ids = {row[0] for row in cursor.fetchall()}
conn.close()

# ------------------------------------------------------------
# Keep only rows whose project_id exists in the database
# ------------------------------------------------------------
invalid_id_rows = df[~df["project_id"].isin(valid_ids)]

if not invalid_id_rows.empty:
    print(f"⚠️ Removing {len(invalid_id_rows)} rows due to missing project IDs")

df = df[df["project_id"].isin(valid_ids)].reset_index(drop=True)
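print(f"✅ {len(df)} rows with valid project IDs remain to be checked")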

# ------------------------------------------------------------
# Persistent session with browser-like headers (some hosts
# reject the default python-requests User-Agent)
session = requests.Session()
session.headers.update({
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/131.0.0.0 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Connection": "keep-alive",
})
# ------------------------------------------------------------
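
# Optional hardening (a sketch, not part of the original flow): let urllib3
# retry transient 5xx responses at the transport layer, on top of the manual
# retry loop below. raise_on_status=False returns the final response instead
# of raising, so the status-code checks below still see the real code.
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

retry_policy = Retry(
    total=1,                           # one extra attempt per request
    backoff_factor=1,                  # small pause between attempts
    status_forcelist=(502, 503, 504),  # retry only on these gateway errors
    raise_on_status=False,
)
adapter = HTTPAdapter(max_retries=retry_policy)
session.mount("http://", adapter)
session.mount("https://", adapter)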

# Tuned timeouts: generous connect, reasonable read
REQUEST_TIMEOUT = (40, 90)  # connect 40s, read 90s

for i, row in df.iterrows():
    url = str(row["url"]).strip()

    print(f"\n🔍 Processing: {url}")

    if not url or not url.startswith(("http://", "https://")):
        df.at[i, "Status"] = "Invalid URL ⚠️"
        print(f"➡️ Status: {df.at[i, 'Status']}")
        continue

    response = None

    try:
        content_flagged = False  # True once the body check has decided the status

        for attempt in range(2):
            try:
                # Always GET + stream → most compatible
                response = session.get(
                    url,
                    timeout=REQUEST_TIMEOUT,
                    stream=True,
                    allow_redirects=True
                )

                status_code = response.status_code

                # Optional light content check (only for text/HTML bodies, so
                # binary responses are not misreported as empty pages)
                try:
                    content_type = response.headers.get("content-type", "").lower()
                    if "text/" in content_type or "html" in content_type:
                        first_chunk = next(response.iter_content(300), b"")
                        text_sample = first_chunk.decode("utf-8", errors="ignore").lower()

                        if not text_sample.strip():
                            df.at[i, "Status"] = "Not Working ❌ (Empty Page)"
                            content_flagged = True
                        else:
                            error_keywords = ["not found", "404", "error", "invalid", "image removed"]
                            if any(k in text_sample for k in error_keywords):
                                df.at[i, "Status"] = "Not Working ❌ (Page Error)"
                                content_flagged = True

                except Exception:
                    pass  # can't read the body → ignore it and trust the status code

                break  # request succeeded; leave the retry loop

            except (requests.exceptions.ConnectionError, ConnectionResetError):
                if attempt == 0:
                    time.sleep(2)  # short backoff before the single retry
                    continue
                else:
                    raise

        # Evaluate the HTTP status code unless the body check already decided
        if not content_flagged:
            if status_code < 400:
                df.at[i, "Status"] = "Working ✅"
            elif status_code in (401, 403):
                df.at[i, "Status"] = f"Accessible 🔒 (Auth Required, Code: {status_code})"
            else:
                df.at[i, "Status"] = f"Not Working ❌ (Code: {status_code})"

    except requests.exceptions.Timeout:
        df.at[i, "Status"] = "Not Working ❌ (Timeout ⏱️)"

    except requests.exceptions.TooManyRedirects:
        df.at[i, "Status"] = "Not Working ❌ (Too Many Redirects 🔁)"

    except requests.exceptions.SSLError:
        # TLS handshake failed: the host answered, but the page was not fetched
        df.at[i, "Status"] = "Working ⚠️ (SSL Error but reachable)"

    except requests.exceptions.ConnectionError as e:
        error_str = str(e).lower()
        # Much stricter DNS detection — only these very specific phrases
        dns_indicators = [
            "name or service not known",
            "nodename nor servname provided, or not known",
            "temporary failure in name resolution",
            "[errno -2]", "[errno 8]",  # common getaddrinfo failures
            "failed to resolve",
            "could not resolve host"
        ]
        if any(ind in error_str for ind in dns_indicators):
            df.at[i, "Status"] = "Not Working ❌ (DNS Error / Domain Not Found)"
        else:
            # Most false-positive cases fall here (socket errors, resets, etc.)
            df.at[i, "Status"] = "Not Working ❌ (Connection Failed)"

    except Exception as e:
        df.at[i, "Status"] = f"Not Working ❌ ({str(e)[:60]})"

    finally:
        if response is not None:
            try:
                response.close()
            except Exception:
                pass

    print(f"➡️ Status: {df.at[i, 'Status']}")

# Save results
df.to_excel(output_file, index=False)

print(f"\n✅ URL check completed. Results saved in '{output_file}'")