#!/usr/bin/env python3
"""
outline-import.py — Import NMC content into Outline KB
Imports: leah-kb.html → "Leah KB" collection, complaint SOPs → "Internal Ops"

Usage:
  python3 scripts/outline-import.py --collection "Leah KB"
  python3 scripts/outline-import.py --collection "Internal Ops"
  python3 scripts/outline-import.py --all
  python3 scripts/outline-import.py --dry-run
"""

import sys
import os
import re
import json
import argparse
import urllib.request
import urllib.error

# Current user's home directory — the API token file and the primary
# leah-kb.html source are looked up under here.
HOME = os.path.expanduser("~")
# Repo root: parent of the directory containing this file (assumes the
# script lives one level below the root, e.g. scripts/ — per the usage docs).
WORKSPACE = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))


def get_config():
    """Resolve the Outline API credentials.

    The token is read from ~/.openclaw/secrets/outline-api-token.txt when that
    file exists, otherwise from the OUTLINE_API_TOKEN environment variable.
    The base URL comes from OUTLINE_API_URL, defaulting to the local instance.

    Returns:
        tuple[str, str]: (token, base_url); token is "" when unconfigured
        (callers are expected to treat an empty token as a fatal error).
    """
    token_path = os.path.join(HOME, ".openclaw/secrets/outline-api-token.txt")
    if os.path.exists(token_path):
        # Context manager closes the handle promptly (the original leaked it);
        # explicit encoding avoids locale-dependent decoding of the token.
        with open(token_path, encoding="utf-8") as fh:
            token = fh.read().strip()
    else:
        token = os.environ.get("OUTLINE_API_TOKEN", "")
    url = os.environ.get("OUTLINE_API_URL", "http://localhost:3300/api")
    return token, url


def api_post(endpoint, payload, token, base_url):
    """POST *payload* as JSON to the Outline API and return the parsed reply.

    On HTTP or connection errors a diagnostic is printed to stderr and None
    is returned instead of raising.
    """
    request = urllib.request.Request(
        f"{base_url}/{endpoint.lstrip('/')}",
        data=json.dumps(payload).encode("utf-8"),
        headers={
            "Content-Type": "application/json",
            "Authorization": f"Bearer {token}",
        },
        method="POST",
    )
    try:
        with urllib.request.urlopen(request, timeout=15) as response:
            return json.loads(response.read())
    except urllib.error.HTTPError as e:
        print(f"HTTP {e.code}: {e.read().decode()[:200]}", file=sys.stderr)
    except urllib.error.URLError as e:
        print(f"Connection error: {e.reason}", file=sys.stderr)
    return None


def get_collection_id(name, token, base_url):
    """Return the id of the collection matching *name* (case-insensitive).

    Returns None when the API call fails or no collection matches.
    """
    wanted = name.lower()
    listing = api_post("collections.list", {"limit": 25}, token, base_url)
    if not listing or not listing.get("ok"):
        return None
    collections = listing.get("data", {}).get("data", [])
    return next((c["id"] for c in collections if c["name"].lower() == wanted), None)


def html_to_markdown(html):
    """Basic HTML → Markdown conversion (no external deps).

    Handles script/style stripping, headings, bold/italic, lists, line
    breaks, paragraphs/divs, rough tables, then strips remaining tags,
    decodes a small set of common HTML entities, and normalizes whitespace.

    Args:
        html: Raw HTML string.

    Returns:
        Markdown-ish plain text, stripped of leading/trailing whitespace.
    """
    text = html

    # Remove script/style tags
    text = re.sub(r'<script[^>]*>.*?</script>', '', text, flags=re.DOTALL | re.IGNORECASE)
    text = re.sub(r'<style[^>]*>.*?</style>', '', text, flags=re.DOTALL | re.IGNORECASE)

    # Headers (h6 first so '#' counts match exactly; n=i binds the level
    # at definition time to dodge the late-binding-closure pitfall)
    for i in range(6, 0, -1):
        text = re.sub(rf'<h{i}[^>]*>(.*?)</h{i}>', lambda m, n=i: f"\n{'#'*n} {m.group(1).strip()}\n", text, flags=re.IGNORECASE | re.DOTALL)

    # Bold/italic
    text = re.sub(r'<strong[^>]*>(.*?)</strong>', r'**\1**', text, flags=re.IGNORECASE | re.DOTALL)
    text = re.sub(r'<b[^>]*>(.*?)</b>', r'**\1**', text, flags=re.IGNORECASE | re.DOTALL)
    text = re.sub(r'<em[^>]*>(.*?)</em>', r'*\1*', text, flags=re.IGNORECASE | re.DOTALL)
    text = re.sub(r'<i[^>]*>(.*?)</i>', r'*\1*', text, flags=re.IGNORECASE | re.DOTALL)

    # Lists
    text = re.sub(r'<li[^>]*>(.*?)</li>', r'\n- \1', text, flags=re.IGNORECASE | re.DOTALL)
    text = re.sub(r'<[ou]l[^>]*>', '\n', text, flags=re.IGNORECASE)
    text = re.sub(r'</[ou]l>', '\n', text, flags=re.IGNORECASE)

    # Line breaks & paragraphs
    text = re.sub(r'<br\s*/?>', '\n', text, flags=re.IGNORECASE)
    text = re.sub(r'<p[^>]*>', '\n\n', text, flags=re.IGNORECASE)
    text = re.sub(r'</p>', '', text, flags=re.IGNORECASE)
    text = re.sub(r'<div[^>]*>', '\n', text, flags=re.IGNORECASE)
    text = re.sub(r'</div>', '\n', text, flags=re.IGNORECASE)

    # Tables (rough)
    text = re.sub(r'<th[^>]*>(.*?)</th>', r'| **\1** ', text, flags=re.IGNORECASE | re.DOTALL)
    text = re.sub(r'<td[^>]*>(.*?)</td>', r'| \1 ', text, flags=re.IGNORECASE | re.DOTALL)
    text = re.sub(r'</tr>', '|\n', text, flags=re.IGNORECASE)
    text = re.sub(r'<tr[^>]*>', '', text, flags=re.IGNORECASE)
    text = re.sub(r'<t(?:head|body|foot)[^>]*>', '', text, flags=re.IGNORECASE)
    text = re.sub(r'</t(?:head|body|foot)>', '', text, flags=re.IGNORECASE)
    text = re.sub(r'<table[^>]*>', '\n', text, flags=re.IGNORECASE)
    text = re.sub(r'</table>', '\n', text, flags=re.IGNORECASE)

    # Strip remaining tags
    text = re.sub(r'<[^>]+>', '', text)

    # HTML entities. '&amp;' MUST be decoded LAST: decoding it first turned
    # doubly-escaped text like '&amp;lt;' into '<' (double-decode bug);
    # the correct single-level result is '&lt;'.
    for entity, char in (
        ('&lt;', '<'), ('&gt;', '>'), ('&nbsp;', ' '),
        ('&mdash;', '-'), ('&ndash;', '-'), ('&#39;', "'"),
        ('&quot;', '"'), ('&ldquo;', '"'), ('&rdquo;', '"'),
        ('&lsquo;', "'"), ('&rsquo;', "'"), ('&amp;', '&'),
    ):
        text = text.replace(entity, char)

    # Clean up whitespace
    text = re.sub(r'\n{3,}', '\n\n', text)
    text = re.sub(r'[ \t]+\n', '\n', text)
    text = text.strip()
    return text


def extract_sections_from_html(html):
    """Split HTML into per-heading documents at each <h2>/<h3>.

    Each returned dict has "title", "level" and "content" (Markdown) keys.
    When no h2/h3 headings are present, the whole input is returned as one
    "Knowledge Base" document.
    """
    heading_re = re.compile(r'<h([23])[^>]*>(.*?)</h\1>', re.IGNORECASE | re.DOTALL)
    headings = list(heading_re.finditer(html))

    if not headings:
        # No clear sections — return whole thing as one doc
        return [{"title": "Knowledge Base", "content": html_to_markdown(html)}]

    # Each section's content runs from its heading to the next heading
    # (or to end-of-input for the last one).
    stops = [m.start() for m in headings[1:]] + [len(html)]

    docs = []
    for match, stop in zip(headings, stops):
        level = int(match.group(1))
        title = re.sub(r'<[^>]+>', '', match.group(2)).strip()
        body_md = html_to_markdown(html[match.end():stop].strip()).strip()

        # Keep empty-bodied h2s (they act as group headers); drop empty h3s.
        if title and (body_md or level == 2):
            docs.append({"title": title, "level": level, "content": body_md})

    return docs


def import_leah_kb(token, base_url, dry_run=False):
    """Import leah-kb.html into Leah KB collection."""
    # Primary location under $HOME, with a workspace-root fallback.
    kb_path = os.path.join(HOME, "nmc-phone/leah-kb.html")
    if not os.path.exists(kb_path):
        print(f"leah-kb.html not found at {kb_path}")
        print("Trying workspace fallback...")
        kb_path = os.path.join(WORKSPACE, "leah-kb.html")
        if not os.path.exists(kb_path):
            print("leah-kb.html not found. Skipping.")
            return

    print(f"\n📚 Importing Leah KB from {kb_path}")
    with open(kb_path, encoding="utf-8", errors="replace") as fh:
        sections = extract_sections_from_html(fh.read())
    print(f"  Found {len(sections)} section(s) to import")

    if dry_run:
        for section in sections:
            print(f"  [DRY RUN] Would create: '{section['title']}' ({len(section['content'])} chars)")
        return

    collection_id = get_collection_id("Leah KB", token, base_url)
    if not collection_id:
        print("  'Leah KB' collection not found in Outline. Create it first.")
        return

    # Create a parent doc; the imported sections nest underneath it.
    parent = api_post("documents.create", {
        "title": "Leah Knowledge Base (Imported)",
        "text": "Imported from leah-kb.html. This is the canonical KB for Leah's runtime context.",
        "collectionId": collection_id,
        "publish": True,
    }, token, base_url)

    parent_id = None
    if parent and parent.get("ok"):
        parent_id = parent.get("data", {}).get("id")

    for section in sections:
        created = api_post("documents.create", {
            "title": section["title"],
            "text": section["content"],
            "collectionId": collection_id,
            "parentDocumentId": parent_id,
            "publish": True,
        }, token, base_url)
        status = "✅ Created" if created and created.get("ok") else "❌ Failed"
        print(f"  {status}: {section['title']}")


def import_complaint_sops(token, base_url, dry_run=False):
    """Import complaint handling SOPs into Internal Ops collection."""
    sop_path = os.path.join(WORKSPACE, "sops/complaint-handling-sop.md")
    if not os.path.exists(sop_path):
        print(f"Complaint SOP not found at {sop_path}. Skipping.")
        return

    print(f"\n📋 Importing Complaint SOPs from {sop_path}")
    with open(sop_path, encoding="utf-8") as fh:
        body = fh.read()

    if dry_run:
        print(f"  [DRY RUN] Would create: 'Complaint Handling SOP' ({len(body)} chars) in Internal Ops")
        return

    collection_id = get_collection_id("Internal Ops", token, base_url)
    if not collection_id:
        print("  'Internal Ops' collection not found. Create it first.")
        return

    response = api_post("documents.create", {
        "title": "Complaint Handling SOP",
        "text": body,
        "collectionId": collection_id,
        "publish": True,
    }, token, base_url)

    if response and response.get("ok"):
        print("  ✅ Created: Complaint Handling SOP")
    else:
        print(f"  ❌ Failed: {response}")


def import_contractor_sops(token, base_url, dry_run=False):
    """Import contractor scheduling SOPs into Internal Ops."""
    sop_path = os.path.join(WORKSPACE, "docs/contractor-scheduling-sops.md")
    if not os.path.exists(sop_path):
        print(f"Contractor SOPs not found at {sop_path}. Skipping.")
        return

    print(f"\n📋 Importing Contractor Scheduling SOPs")
    with open(sop_path, encoding="utf-8") as fh:
        body = fh.read()

    if dry_run:
        print(f"  [DRY RUN] Would create: 'Contractor Scheduling SOPs' ({len(body)} chars)")
        return

    collection_id = get_collection_id("Internal Ops", token, base_url)
    if not collection_id:
        print("  'Internal Ops' collection not found.")
        return

    response = api_post("documents.create", {
        "title": "Contractor Scheduling SOPs",
        "text": body,
        "collectionId": collection_id,
        "publish": True,
    }, token, base_url)

    if response and response.get("ok"):
        print("  ✅ Created: Contractor Scheduling SOPs")
    else:
        print(f"  ❌ Failed: {response}")


def main():
    """CLI entry point: parse args, resolve credentials, dispatch imports."""
    parser = argparse.ArgumentParser(description="Import NMC content into Outline KB")
    parser.add_argument("--collection", choices=["Leah KB", "Internal Ops"], help="Import to specific collection")
    parser.add_argument("--all", action="store_true", help="Import all content")
    parser.add_argument("--dry-run", action="store_true", help="Preview without creating")
    args = parser.parse_args()

    token, base_url = get_config()
    if not token:
        print("No API token. See outline/setup-runbook.md Step 7.")
        sys.exit(1)

    if args.dry_run:
        print("🔍 DRY RUN — no documents will be created\n")

    wants_leah = args.all or args.collection == "Leah KB"
    wants_ops = args.all or args.collection == "Internal Ops"

    if wants_leah:
        import_leah_kb(token, base_url, dry_run=args.dry_run)

    if wants_ops:
        import_complaint_sops(token, base_url, dry_run=args.dry_run)
        import_contractor_sops(token, base_url, dry_run=args.dry_run)

    # Neither --all nor --collection given: show usage instead of doing nothing.
    if not (args.all or args.collection):
        parser.print_help()


# Entry-point guard: run only when executed as a script, not on import.
if __name__ == "__main__":
    main()
