#!/usr/bin/env python3
"""
CRLP Live Fetch Probe v0.2

Observation-only probe for authorized CRLP live cache experiments.

New in v0.2:
- Searches for markers in the fetched HTML/body.
- Optionally saves response bodies.
- Keeps v0.1 sequence: normal read, second normal read, cache-bypass read.
- No purge, no modification, no destructive actions.
"""

from __future__ import annotations

import argparse
import hashlib
import json
import time
import urllib.parse
import urllib.request
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List


# Response headers copied into every observation. Names are lowercase because
# fetch() lowercases the response header names before looking these up; a
# header that is absent from a response is recorded as an empty string.
WATCH_HEADERS = [
    "cache-control",
    "cdn-cache-control",
    "vercel-cdn-cache-control",
    "x-vercel-cache",
    "x-vercel-id",
    "etag",
    "last-modified",
    "age",
    "date",
    "server",
    "content-type",
]


def utc_now() -> str:
    """Return the current time in UTC as a timezone-aware ISO 8601 string."""
    current = datetime.now(tz=timezone.utc)
    return current.isoformat()


def add_cache_buster(url: str, token: str) -> str:
    """Return *url* with a ``crlp_probe=<token>`` query parameter appended.

    Existing query parameters, including blank-valued ones, are kept in their
    original order ahead of the new parameter. The query string is parsed and
    re-encoded, so its escaping may be normalised (e.g. ``%20`` becomes ``+``).
    """
    parts = urllib.parse.urlparse(url)
    pairs = urllib.parse.parse_qsl(parts.query, keep_blank_values=True)
    encoded = urllib.parse.urlencode([*pairs, ("crlp_probe", token)])
    return parts._replace(query=encoded).geturl()


def decode_body(body: bytes, content_type: str) -> str:
    """Decode a response body to text using the charset named in Content-Type.

    Args:
        body: Raw response bytes.
        content_type: Value of the ``Content-Type`` header; may be empty or
            ``None``, in which case UTF-8 is assumed.

    Returns:
        The decoded text. Undecodable bytes are replaced rather than raising,
        and an unknown or missing charset falls back to UTF-8.
    """
    charset = "utf-8"
    lowered = (content_type or "").lower()
    if "charset=" in lowered:
        declared = lowered.split("charset=", 1)[1].split(";", 1)[0]
        # Parameter values may be sent quoted (charset="utf-8"); left in
        # place, the quotes make the codec lookup fail and a non-UTF-8 body
        # would silently be decoded as UTF-8.
        charset = declared.strip().strip("\"'").strip() or "utf-8"
    try:
        return body.decode(charset, errors="replace")
    except LookupError:
        # Unknown (or non-text) codec name: fall back to UTF-8.
        return body.decode("utf-8", errors="replace")


def marker_results(text: str, markers: List[str]) -> Dict[str, bool]:
    """Map each marker to whether it occurs as a substring of *text*."""
    found: Dict[str, bool] = {}
    for needle in markers:
        found[needle] = needle in text
    return found


def fetch(url: str, label: str, markers: List[str], timeout: int = 20) -> Dict[str, Any]:
    """Issue one GET for *url* and summarise the response as an observation.

    Every request carries the probe's User-Agent plus ``Cache-Control:
    no-cache`` and ``Pragma: no-cache`` request headers.

    Args:
        url: URL to request.
        label: Name stored in the observation to identify this read.
        markers: Strings to look for in the decoded body.
        timeout: Timeout passed to ``urlopen``.

    Returns:
        A dict with the label, request timestamp, requested and final URL,
        HTTP status, SHA-256 and length of the raw body, the watched response
        headers, per-marker presence, and the decoded body under
        ``"_body_text"``.

    Exceptions raised by ``urlopen`` are not caught here and propagate to
    the caller.
    """
    request_headers = {
        "User-Agent": "VEHICLE-CRLP-LiveProbe/0.2",
        "Cache-Control": "no-cache",
        "Pragma": "no-cache",
    }
    request = urllib.request.Request(url, headers=request_headers, method="GET")

    # Timestamp is taken just before the request is sent.
    started_at = utc_now()
    with urllib.request.urlopen(request, timeout=timeout) as response:
        payload = response.read()
        final_url = response.geturl()
        status = response.status
        # Lowercase the names so lookups match the lowercase WATCH_HEADERS.
        received = {name.lower(): value for name, value in response.headers.items()}

    watched = {name: received.get(name, "") for name in WATCH_HEADERS}
    body_text = decode_body(payload, watched.get("content-type", ""))

    return {
        "label": label,
        "request_timestamp_utc": started_at,
        "url_requested": url,
        "final_url": final_url,
        "http_status": status,
        # Hash and length describe the raw bytes, not the decoded text.
        "content_hash_sha256": hashlib.sha256(payload).hexdigest(),
        "content_length": len(payload),
        "headers": watched,
        "marker_results": marker_results(body_text, markers),
        "_body_text": body_text,
    }


def strip_body(obs: Dict[str, Any]) -> Dict[str, Any]:
    """Return a shallow copy of *obs* without its ``"_body_text"`` entry.

    The input dict is left unmodified.
    """
    return {key: value for key, value in obs.items() if key != "_body_text"}


def save_body_files(observations: List[Dict[str, Any]], prefix: str) -> List[str]:
    """Write each observation's decoded body to ``<prefix>_<label>.html``.

    Args:
        observations: Observation dicts; each must have a ``"label"`` and may
            have the decoded body under ``"_body_text"`` (a missing body is
            written as an empty file).
        prefix: Path prefix for the output files. It may include directory
            components; missing directories are created.

    Returns:
        The written file paths as strings, in observation order.
    """
    paths: List[str] = []
    for obs in observations:
        path = Path(f"{prefix}_{obs['label']}.html")
        # The prefix may point into a directory that does not exist yet.
        # Create it instead of failing after the fetches have already run.
        path.parent.mkdir(parents=True, exist_ok=True)
        # Write encoded bytes rather than text so newlines in the body are
        # not translated to the platform line separator on the way to disk.
        path.write_bytes(obs.get("_body_text", "").encode("utf-8"))
        paths.append(str(path))
    return paths


def main() -> None:
    """Command-line entry point: run the three-read probe and write the report.

    Performs two reads of the given URL and one read of the same URL with a
    cache-busting query parameter, sleeping ``--sleep`` seconds between reads.
    Optionally saves the bodies, then writes the full JSON report to ``--out``
    and prints a short JSON summary to stdout.
    """
    cli = argparse.ArgumentParser(description="CRLP live fetch probe v0.2")
    cli.add_argument("url", help="Controlled URL to observe")
    cli.add_argument("--out", default="crlp_live_observation_v0_2.json")
    cli.add_argument("--sleep", type=float, default=2.0)
    cli.add_argument(
        "--marker",
        action="append",
        default=[],
        help="Marker text to search in response body. Can be repeated.",
    )
    cli.add_argument(
        "--save-body",
        action="store_true",
        help="Save fetched HTML/body files for inspection.",
    )
    cli.add_argument(
        "--body-prefix",
        default="crlp_body",
        help="Prefix for saved body files.",
    )
    args = cli.parse_args()

    # The cache-buster token is the current Unix time in whole seconds.
    bypass_url = add_cache_buster(args.url, str(int(time.time())))

    read_plan = (
        (args.url, "first_normal_read"),
        (args.url, "second_normal_read"),
        (bypass_url, "cache_bypass_read"),
    )
    observations: List[Dict[str, Any]] = []
    for position, (target, label) in enumerate(read_plan):
        # Pause between reads, but not before the first one.
        if position > 0:
            time.sleep(args.sleep)
        observations.append(fetch(target, label, args.marker))

    # Bodies are saved before the "_body_text" entries are stripped for the report.
    saved_body_files: List[str] = (
        save_body_files(observations, args.body_prefix) if args.save_body else []
    )

    first_digest, second_digest, bypass_digest = (
        obs["content_hash_sha256"] for obs in observations
    )

    # A marker counts as present only if every one of the three reads contained it.
    markers_everywhere: Dict[str, bool] = {}
    for marker in args.marker:
        markers_everywhere[marker] = all(
            obs["marker_results"].get(marker, False) for obs in observations
        )

    comparison = {
        "first_vs_second_hash_equal": first_digest == second_digest,
        "normal_vs_bypass_hash_equal": second_digest == bypass_digest,
        "all_hashes_equal": first_digest == second_digest == bypass_digest,
        "observed_x_vercel_cache_values": [
            obs["headers"].get("x-vercel-cache", "") for obs in observations
        ],
        "all_markers_present_in_all_reads": markers_everywhere,
    }

    report = {
        "protocol": "E.I.A.R.(V)-AERiV / CRLP",
        "probe": "CRLP Live Fetch Probe v0.2",
        "generated_at_utc": utc_now(),
        "source_url": args.url,
        "cache_bypass_url": bypass_url,
        "markers_requested": args.marker,
        "observations": [strip_body(obs) for obs in observations],
        "comparison": comparison,
        "saved_body_files": saved_body_files,
        "safety_note": "Observation only. No purge, no destructive request, authorized sources only.",
    }

    with open(args.out, "w", encoding="utf-8") as handle:
        json.dump(report, handle, indent=2, ensure_ascii=False)

    summary = {
        "output": args.out,
        "first_vs_second_hash_equal": comparison["first_vs_second_hash_equal"],
        "normal_vs_bypass_hash_equal": comparison["normal_vs_bypass_hash_equal"],
        "all_hashes_equal": comparison["all_hashes_equal"],
        "x_vercel_cache_values": comparison["observed_x_vercel_cache_values"],
        "all_markers_present_in_all_reads": comparison["all_markers_present_in_all_reads"],
        "saved_body_files": saved_body_files,
    }
    print(json.dumps(summary, indent=2, ensure_ascii=False))


# Run the probe only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
