#!/usr/bin/env python3
"""
CRLP Multi-URL Compare Probe v0.1

Compare multiple authorized URLs that should represent the same source/build.
Observation only. No purge. No destructive request.
"""

from __future__ import annotations

import argparse
import hashlib
import json
import time
import urllib.parse
import urllib.request
from datetime import datetime, timezone
from typing import Any, Dict, List


# Response header names copied into every observation's "headers" mapping.
# Names are lowercase because fetch() lowercases the response header names
# before looking them up; a header the response does not carry is recorded
# as an empty string.
WATCH_HEADERS = [
    "cache-control",
    "cdn-cache-control",
    "vercel-cdn-cache-control",
    "x-vercel-cache",
    "x-vercel-id",
    "etag",
    "last-modified",
    "age",
    "date",
    "server",
    "content-type",
]


def utc_now() -> str:
    """Return the current time in UTC as an ISO 8601 string.

    The value is timezone-aware, so the string carries an explicit
    ``+00:00`` offset.
    """
    current = datetime.now(tz=timezone.utc)
    return current.isoformat()


def add_cache_buster(url: str, token: str) -> str:
    """Return *url* with a ``crlp_probe=<token>`` query parameter appended.

    The existing query string is parsed into key/value pairs (blank values
    are kept), the probe pair is added last, and the query is re-encoded.
    All other URL components, including the fragment, are carried over.
    """
    parts = urllib.parse.urlparse(url)
    pairs = [
        *urllib.parse.parse_qsl(parts.query, keep_blank_values=True),
        ("crlp_probe", token),
    ]
    rebuilt = parts._replace(query=urllib.parse.urlencode(pairs))
    return urllib.parse.urlunparse(rebuilt)


def decode_body(body: bytes, content_type: str) -> str:
    """Decode a response body to text using the charset from Content-Type.

    Args:
        body: Raw response bytes.
        content_type: Value of the Content-Type header; may be empty or None.

    Returns:
        The decoded text. Undecodable bytes are replaced rather than raising.
        UTF-8 is used when no charset is declared, when the declared value is
        empty, or when the declared codec cannot be used.

    The charset value may be quoted (``charset="utf-8"``) and may have
    whitespace around the ``=``; both forms are accepted.
    """
    charset = "utf-8"

    # Walk the ";"-separated pieces; the media type itself has no "=" and is
    # skipped. Only a parameter named exactly "charset" is honoured, so
    # look-alikes such as "x-charset=..." are ignored.
    for piece in (content_type or "").split(";"):
        name, sep, value = piece.partition("=")
        if not sep or name.strip().lower() != "charset":
            continue
        # Strip optional quoting before handing the name to the codec lookup;
        # a quoted name would otherwise never resolve.
        declared = value.strip().strip("\"'").strip()
        if declared:
            charset = declared
        break  # first charset parameter wins

    try:
        return body.decode(charset, errors="replace")
    except (LookupError, ValueError):
        # LookupError: unknown or non-text codec. ValueError: defensive, for a
        # codec name the runtime refuses outright. Either way fall back to UTF-8.
        return body.decode("utf-8", errors="replace")


def fetch(url: str, label: str, markers: List[str], timeout: int = 20) -> Dict[str, Any]:
    """Perform one GET request and return an observation record for it.

    Args:
        url: Address to request.
        label: Name stored with the observation.
        markers: Substrings to look for in the decoded response body.
        timeout: Timeout in seconds handed to ``urlopen``.

    Returns:
        A dict holding the label, the pre-request UTC timestamp, the requested
        and final URLs, the HTTP status, the SHA-256 and length of the raw
        body, the watched response headers, and a per-marker presence flag.

    Exceptions raised by ``urlopen`` are not caught here and propagate to
    the caller.
    """
    request_headers = {
        "User-Agent": "VEHICLE-CRLP-MultiURLProbe/0.1",
        "Cache-Control": "no-cache",
        "Pragma": "no-cache",
    }
    request = urllib.request.Request(url, headers=request_headers, method="GET")

    # Timestamp is taken before the request is sent, not when it completes.
    requested_at = utc_now()

    with urllib.request.urlopen(request, timeout=timeout) as response:
        payload = response.read()
        # Lowercase the names so the WATCH_HEADERS lookup is case-insensitive.
        received = {}
        for name, value in response.headers.items():
            received[name.lower()] = value
        final_url = response.geturl()
        status = response.status

    # Headers absent from the response are recorded as empty strings.
    watched = {name: received.get(name, "") for name in WATCH_HEADERS}
    page_text = decode_body(payload, watched.get("content-type", ""))

    marker_hits = {}
    for marker in markers:
        marker_hits[marker] = marker in page_text

    # The hash and length describe the raw bytes, not the decoded text.
    observation = {
        "label": label,
        "request_timestamp_utc": requested_at,
        "url_requested": url,
        "final_url": final_url,
        "http_status": status,
        "content_hash_sha256": hashlib.sha256(payload).hexdigest(),
        "content_length": len(payload),
        "headers": watched,
        "marker_results": marker_hits,
    }
    return observation


def main() -> None:
    """Run the probe from the command line.

    Fetches every ``--url`` (and, with ``--probe``, a cache-busting variant
    of each), compares body hashes, ETags and ``x-vercel-cache`` values
    across all reads, writes the full report as JSON to ``--out``, and
    prints a short JSON summary to stdout.
    """
    cli = argparse.ArgumentParser(description="CRLP multi-URL compare probe v0.1")
    cli.add_argument("--url", action="append", required=True, help="URL to compare. Use multiple --url entries.")
    cli.add_argument("--label", action="append", default=[], help="Optional label for each URL, same order as --url.")
    cli.add_argument("--marker", action="append", default=[], help="Marker to search. Can be repeated.")
    cli.add_argument("--out", default="live_tc_003_multi_url_compare.json")
    cli.add_argument("--probe", action="store_true", help="Also fetch cache-busting probe variant for each URL.")
    cli.add_argument("--sleep", type=float, default=1.0)
    args = cli.parse_args()

    # One token per run: every probe variant shares the same cache-buster value.
    probe_token = str(int(time.time()))
    observations: List[Dict[str, Any]] = []

    def observe(target: str, name: str) -> None:
        """Fetch one target, record the observation, then pause."""
        observations.append(fetch(target, name, args.marker))
        time.sleep(args.sleep)

    for position, url in enumerate(args.url, start=1):
        # URLs without a matching --label get a positional default name.
        if position <= len(args.label):
            label = args.label[position - 1]
        else:
            label = f"url_{position}"

        observe(url, label)
        if args.probe:
            observe(add_cache_buster(url, probe_token), f"{label}_crlp_probe")

    distinct_hashes = {entry["content_hash_sha256"] for entry in observations}
    # A missing ETag is recorded as "", so reads without one compare as equal.
    distinct_etags = {entry["headers"].get("etag", "") for entry in observations}

    # A marker counts as present only if every single read contained it.
    markers_everywhere = {}
    for marker in args.marker:
        markers_everywhere[marker] = all(
            entry["marker_results"].get(marker, False) for entry in observations
        )

    report = {
        "protocol": "E.I.A.R.(V)-AERiV / CRLP",
        "probe": "CRLP Multi-URL Compare Probe v0.1",
        "generated_at_utc": utc_now(),
        "urls": args.url,
        "markers_requested": args.marker,
        "observations": observations,
        "comparison": {
            "all_hashes_equal": len(distinct_hashes) == 1,
            "unique_hashes": sorted(distinct_hashes),
            "all_etags_equal": len(distinct_etags) == 1,
            "unique_etags": sorted(distinct_etags),
            "x_vercel_cache_values": [entry["headers"].get("x-vercel-cache", "") for entry in observations],
            "all_markers_present_in_all_reads": markers_everywhere,
        },
        "safety_note": "Observation only. No purge, no destructive request, authorized sources only.",
    }

    with open(args.out, "w", encoding="utf-8") as handle:
        json.dump(report, handle, indent=2, ensure_ascii=False)

    # Console summary: the output path plus selected comparison fields.
    comparison = report["comparison"]
    summary_keys = (
        "all_hashes_equal",
        "unique_hashes",
        "all_etags_equal",
        "x_vercel_cache_values",
        "all_markers_present_in_all_reads",
    )
    summary = {"output": args.out}
    for key in summary_keys:
        summary[key] = comparison[key]
    print(json.dumps(summary, indent=2, ensure_ascii=False))


if __name__ == "__main__":
    main()
