#!/usr/bin/env python3
"""
outline-import.py — Import NMC content into Outline KB
Imports: leah-kb.html → "Leah KB" collection, complaint SOPs → "Internal Ops"

Usage:
  python3 scripts/outline-import.py --collection "Leah KB"
  python3 scripts/outline-import.py --collection "Internal Ops"
  python3 scripts/outline-import.py --all
  python3 scripts/outline-import.py --dry-run
"""

import sys
import os
import re
import json
import argparse
import urllib.request
import urllib.error

# Current user's home directory — the API token file and the primary
# leah-kb.html source are looked up under here.
HOME = os.path.expanduser("~")
# Repo root: parent of the directory containing this file (assumes the
# script lives one level below the root, e.g. scripts/ — per the usage docs).
WORKSPACE = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))


def get_config():
    """Resolve the Outline API credentials.

    The token is read from ~/.openclaw/secrets/outline-api-token.txt when that
    file exists, otherwise from the OUTLINE_API_TOKEN environment variable.
    The base URL comes from OUTLINE_API_URL, defaulting to the local instance.

    Returns:
        tuple[str, str]: (token, base_url); token is "" when unconfigured
        (callers are expected to treat an empty token as a fatal error).
    """
    token_path = os.path.join(HOME, ".openclaw/secrets/outline-api-token.txt")
    if os.path.exists(token_path):
        # Context manager closes the handle promptly (the original leaked it);
        # explicit encoding avoids locale-dependent decoding of the token.
        with open(token_path, encoding="utf-8") as fh:
            token = fh.read().strip()
    else:
        token = os.environ.get("OUTLINE_API_TOKEN", "")
    url = os.environ.get("OUTLINE_API_URL", "http://localhost:3300/api")
    return token, url


def api_post(endpoint, payload, token, base_url):
    """POST *payload* as JSON to the Outline API and return the parsed reply.

    On HTTP or connection errors a diagnostic is printed to stderr and None
    is returned instead of raising.
    """
    request = urllib.request.Request(
        f"{base_url}/{endpoint.lstrip('/')}",
        data=json.dumps(payload).encode("utf-8"),
        headers={
            "Content-Type": "application/json",
            "Authorization": f"Bearer {token}",
        },
        method="POST",
    )
    try:
        with urllib.request.urlopen(request, timeout=15) as response:
            return json.loads(response.read())
    except urllib.error.HTTPError as e:
        print(f"HTTP {e.code}: {e.read().decode()[:200]}", file=sys.stderr)
    except urllib.error.URLError as e:
        print(f"Connection error: {e.reason}", file=sys.stderr)
    return None


def get_collection_id(name, token, base_url):
    """Return the id of the collection matching *name* (case-insensitive).

    Returns None when the API call fails or no collection matches.
    """
    wanted = name.lower()
    listing = api_post("collections.list", {"limit": 25}, token, base_url)
    if not listing or not listing.get("ok"):
        return None
    collections = listing.get("data", {}).get("data", [])
    return next((c["id"] for c in collections if c["name"].lower() == wanted), None)


def html_to_markdown(html):
    """Basic HTML → Markdown conversion (no external deps).

    Handles script/style stripping, headings, bold/italic, lists, line
    breaks, paragraphs/divs, rough tables, then strips remaining tags,
    decodes a small set of common HTML entities, and normalizes whitespace.

    Args:
        html: Raw HTML string.

    Returns:
        Markdown-ish plain text, stripped of leading/trailing whitespace.
    """
    text = html

    # Remove script/style tags
    text = re.sub(r'<script[^>]*>.*?</script>', '', text, flags=re.DOTALL | re.IGNORECASE)
    text = re.sub(r'<style[^>]*>.*?</style>', '', text, flags=re.DOTALL | re.IGNORECASE)

    # Headers (h6 first so '#' counts match exactly; n=i binds the level
    # at definition time to dodge the late-binding-closure pitfall)
    for i in range(6, 0, -1):
        text = re.sub(rf'<h{i}[^>]*>(.*?)</h{i}>', lambda m, n=i: f"\n{'#'*n} {m.group(1).strip()}\n", text, flags=re.IGNORECASE | re.DOTALL)

    # Bold/italic
    text = re.sub(r'<strong[^>]*>(.*?)</strong>', r'**\1**', text, flags=re.IGNORECASE | re.DOTALL)
    text = re.sub(r'<b[^>]*>(.*?)</b>', r'**\1**', text, flags=re.IGNORECASE | re.DOTALL)
    text = re.sub(r'<em[^>]*>(.*?)</em>', r'*\1*', text, flags=re.IGNORECASE | re.DOTALL)
    text = re.sub(r'<i[^>]*>(.*?)</i>', r'*\1*', text, flags=re.IGNORECASE | re.DOTALL)

    # Lists
    text = re.sub(r'<li[^>]*>(.*?)</li>', r'\n- \1', text, flags=re.IGNORECASE | re.DOTALL)
    text = re.sub(r'<[ou]l[^>]*>', '\n', text, flags=re.IGNORECASE)
    text = re.sub(r'</[ou]l>', '\n', text, flags=re.IGNORECASE)

    # Line breaks & paragraphs
    text = re.sub(r'<br\s*/?>', '\n', text, flags=re.IGNORECASE)
    text = re.sub(r'<p[^>]*>', '\n\n', text, flags=re.IGNORECASE)
    text = re.sub(r'</p>', '', text, flags=re.IGNORECASE)
    text = re.sub(r'<div[^>]*>', '\n', text, flags=re.IGNORECASE)
    text = re.sub(r'</div>', '\n', text, flags=re.IGNORECASE)

    # Tables (rough)
    text = re.sub(r'<th[^>]*>(.*?)</th>', r'| **\1** ', text, flags=re.IGNORECASE | re.DOTALL)
    text = re.sub(r'<td[^>]*>(.*?)</td>', r'| \1 ', text, flags=re.IGNORECASE | re.DOTALL)
    text = re.sub(r'</tr>', '|\n', text, flags=re.IGNORECASE)
    text = re.sub(r'<tr[^>]*>', '', text, flags=re.IGNORECASE)
    text = re.sub(r'<t(?:head|body|foot)[^>]*>', '', text, flags=re.IGNORECASE)
    text = re.sub(r'</t(?:head|body|foot)>', '', text, flags=re.IGNORECASE)
    text = re.sub(r'<table[^>]*>', '\n', text, flags=re.IGNORECASE)
    text = re.sub(r'</table>', '\n', text, flags=re.IGNORECASE)

    # Strip remaining tags
    text = re.sub(r'<[^>]+>', '', text)

    # HTML entities. '&amp;' MUST be decoded LAST: decoding it first turned
    # doubly-escaped text like '&amp;lt;' into '<' (double-decode bug);
    # the correct single-level result is '&lt;'.
    for entity, char in (
        ('&lt;', '<'), ('&gt;', '>'), ('&nbsp;', ' '),
        ('&mdash;', '-'), ('&ndash;', '-'), ('&#39;', "'"),
        ('&quot;', '"'), ('&ldquo;', '"'), ('&rdquo;', '"'),
        ('&lsquo;', "'"), ('&rsquo;', "'"), ('&amp;', '&'),
    ):
        text = text.replace(entity, char)

    # Clean up whitespace
    text = re.sub(r'\n{3,}', '\n\n', text)
    text = re.sub(r'[ \t]+\n', '\n', text)
    text = text.strip()
    return text


def extract_sections_from_html(html):
    """Split HTML into per-heading documents at each <h2>/<h3>.

    Each returned dict has "title", "level" and "content" (Markdown) keys.
    When no h2/h3 headings are present, the whole input is returned as one
    "Knowledge Base" document.
    """
    heading_re = re.compile(r'<h([23])[^>]*>(.*?)</h\1>', re.IGNORECASE | re.DOTALL)
    headings = list(heading_re.finditer(html))

    if not headings:
        # No clear sections — return whole thing as one doc
        return [{"title": "Knowledge Base", "content": html_to_markdown(html)}]

    # Each section's content runs from its heading to the next heading
    # (or to end-of-input for the last one).
    stops = [m.start() for m in headings[1:]] + [len(html)]

    docs = []
    for match, stop in zip(headings, stops):
        level = int(match.group(1))
        title = re.sub(r'<[^>]+>', '', match.group(2)).strip()
        body_md = html_to_markdown(html[match.end():stop].strip()).strip()

        # Keep empty-bodied h2s (they act as group headers); drop empty h3s.
        if title and (body_md or level == 2):
            docs.append({"title": title, "level": level, "content": body_md})

    return docs


def import_leah_kb(token, base_url, dry_run=False):
    """Import leah-kb.html into Leah KB collection."""
    # Primary location under $HOME, with a workspace-root fallback.
    kb_path = os.path.join(HOME, "nmc-phone/leah-kb.html")
    if not os.path.exists(kb_path):
        print(f"leah-kb.html not found at {kb_path}")
        print("Trying workspace fallback...")
        kb_path = os.path.join(WORKSPACE, "leah-kb.html")
        if not os.path.exists(kb_path):
            print("leah-kb.html not found. Skipping.")
            return

    print(f"\n📚 Importing Leah KB from {kb_path}")
    with open(kb_path, encoding="utf-8", errors="replace") as fh:
        sections = extract_sections_from_html(fh.read())
    print(f"  Found {len(sections)} section(s) to import")

    if dry_run:
        for section in sections:
            print(f"  [DRY RUN] Would create: '{section['title']}' ({len(section['content'])} chars)")
        return

    collection_id = get_collection_id("Leah KB", token, base_url)
    if not collection_id:
        print("  'Leah KB' collection not found in Outline. Create it first.")
        return

    # Create a parent doc; the imported sections nest underneath it.
    parent = api_post("documents.create", {
        "title": "Leah Knowledge Base (Imported)",
        "text": "Imported from leah-kb.html. This is the canonical KB for Leah's runtime context.",
        "collectionId": collection_id,
        "publish": True,
    }, token, base_url)

    parent_id = None
    if parent and parent.get("ok"):
        parent_id = parent.get("data", {}).get("id")

    for section in sections:
        created = api_post("documents.create", {
            "title": section["title"],
            "text": section["content"],
            "collectionId": collection_id,
            "parentDocumentId": parent_id,
            "publish": True,
        }, token, base_url)
        status = "✅ Created" if created and created.get("ok") else "❌ Failed"
        print(f"  {status}: {section['title']}")


def import_complaint_sops(token, base_url, dry_run=False):
    """Import complaint handling SOPs into Internal Ops collection."""
    sop_path = os.path.join(WORKSPACE, "sops/complaint-handling-sop.md")
    if not os.path.exists(sop_path):
        print(f"Complaint SOP not found at {sop_path}. Skipping.")
        return

    print(f"\n📋 Importing Complaint SOPs from {sop_path}")
    with open(sop_path, encoding="utf-8") as fh:
        body = fh.read()

    if dry_run:
        print(f"  [DRY RUN] Would create: 'Complaint Handling SOP' ({len(body)} chars) in Internal Ops")
        return

    collection_id = get_collection_id("Internal Ops", token, base_url)
    if not collection_id:
        print("  'Internal Ops' collection not found. Create it first.")
        return

    response = api_post("documents.create", {
        "title": "Complaint Handling SOP",
        "text": body,
        "collectionId": collection_id,
        "publish": True,
    }, token, base_url)

    if response and response.get("ok"):
        print("  ✅ Created: Complaint Handling SOP")
    else:
        print(f"  ❌ Failed: {response}")


def import_contractor_sops(token, base_url, dry_run=False):
    """Import contractor scheduling SOPs into Internal Ops."""
    sop_path = os.path.join(WORKSPACE, "docs/contractor-scheduling-sops.md")
    if not os.path.exists(sop_path):
        print(f"Contractor SOPs not found at {sop_path}. Skipping.")
        return

    print(f"\n📋 Importing Contractor Scheduling SOPs")
    with open(sop_path, encoding="utf-8") as fh:
        body = fh.read()

    if dry_run:
        print(f"  [DRY RUN] Would create: 'Contractor Scheduling SOPs' ({len(body)} chars)")
        return

    collection_id = get_collection_id("Internal Ops", token, base_url)
    if not collection_id:
        print("  'Internal Ops' collection not found.")
        return

    response = api_post("documents.create", {
        "title": "Contractor Scheduling SOPs",
        "text": body,
        "collectionId": collection_id,
        "publish": True,
    }, token, base_url)

    if response and response.get("ok"):
        print("  ✅ Created: Contractor Scheduling SOPs")
    else:
        print(f"  ❌ Failed: {response}")


def main():
    """CLI entry point: parse args, resolve credentials, dispatch imports."""
    parser = argparse.ArgumentParser(description="Import NMC content into Outline KB")
    parser.add_argument("--collection", choices=["Leah KB", "Internal Ops"], help="Import to specific collection")
    parser.add_argument("--all", action="store_true", help="Import all content")
    parser.add_argument("--dry-run", action="store_true", help="Preview without creating")
    args = parser.parse_args()

    token, base_url = get_config()
    if not token:
        print("No API token. See outline/setup-runbook.md Step 7.")
        sys.exit(1)

    if args.dry_run:
        print("🔍 DRY RUN — no documents will be created\n")

    wants_leah = args.all or args.collection == "Leah KB"
    wants_ops = args.all or args.collection == "Internal Ops"

    if wants_leah:
        import_leah_kb(token, base_url, dry_run=args.dry_run)

    if wants_ops:
        import_complaint_sops(token, base_url, dry_run=args.dry_run)
        import_contractor_sops(token, base_url, dry_run=args.dry_run)

    # Neither --all nor --collection given: show usage instead of doing nothing.
    if not (args.all or args.collection):
        parser.print_help()


# Entry-point guard: run only when executed as a script, not on import.
if __name__ == "__main__":
    main()
