#!/usr/bin/env python3
"""Clawvard software-copyright format-compliance patch.

The upstream Fokkyp/SoftwareCopyright-Skill (MIT) emits 3 .docx + 1 .txt + a
build report. The Chinese NCAC software-copyright registration spec actually
asks for:

  * 宋体（SimSun）五号（10.5pt） code body — the upstream defaults to Consolas 7pt
  * Continuously visible line-number prefix (left margin) so reviewers can audit
  * For long projects, a 4th 代码(全部).docx full-source excerpt variant

This script reads the upstream's 正式资料/ directory, re-emits the code .docx
files with SimSun 10.5pt + a visible "<n> │ " line prefix on every line, and
adds the 4th 代码(全部).docx so the end deliverable is 4 .docx + 1 .txt.

Only depends on the public `python-docx` PyPI package.

Usage:
  pip install python-docx
  python3 format-compliance.py \
      --workdir 软件著作权申请资料 \
      --software-name "你的软件全称" \
      --version "V1.0"

Source: https://clawvard.school/skills/software-copyright/format-compliance.py
Course: https://clawvard.school/courses/software-copyright
License: MIT (matches upstream).
"""
from __future__ import annotations

import argparse
import io
import re
import shutil
import sys
import zipfile
from pathlib import Path

try:
    import docx
    from docx.shared import Pt, Cm, RGBColor
    from docx.oxml.ns import qn
    from docx.oxml import OxmlElement
    from docx.enum.text import WD_ALIGN_PARAGRAPH
except ImportError:
    sys.stderr.write(
        "[format-compliance] python-docx is required.\n"
        "  pip install python-docx\n"
    )
    sys.exit(2)

# Typeface name and body size (in points) applied to the runs this script emits.
FONT_CN = "SimSun"
FONT_SIZE_PT = 10.5
# NOTE(review): this name is not referenced by the code visible in this file;
# `main` uses a literal `None` default for --total-lines instead. Confirm before removing.
DEFAULT_TOTAL_LINES = None  # auto-derived from the application form if available

# Deterministic-zip parameters. A .docx is an Office Open XML zip archive:
# every zip entry carries a DOS date/time stamp, and docProps/core.xml carries
# dcterms:created / dcterms:modified values. According to the original author
# both default to the wall clock, so two runs over identical input yield
# different file fingerprints. `_normalize_docx_deterministically` rebuilds each
# .docx with every entry pinned to EPOCH_DATE_TIME, and `_patch_core_xml`
# rewrites the dcterms values to EPOCH_ISO, so the fingerprint depends only on
# the input. The two constants below describe the same instant.
EPOCH_DATE_TIME = (2026, 1, 1, 0, 0, 0)
EPOCH_ISO = "2026-01-01T00:00:00Z"

# ─────────────────────────────────────────────────────────────────────────────
# Low-level helpers
# ─────────────────────────────────────────────────────────────────────────────

def _set_run_font(run, size_pt: float = FONT_SIZE_PT, *, bold: bool = False, italic: bool = False) -> None:
    """Style ``run`` as black ``FONT_CN`` text at ``size_pt`` points.

    The font name is applied through python-docx and then written explicitly
    into the east-asian, ascii, hAnsi and cs slots of the run's ``w:rFonts``
    element. ``bold`` and ``italic`` are assigned as given.
    """
    font = run.font
    font.name = FONT_CN
    font.size = Pt(size_pt)
    run.bold, run.italic = bold, italic

    run_props = run._element.get_or_add_rPr()
    fonts_el = run_props.find(qn("w:rFonts"))
    if fonts_el is None:
        # No rFonts element yet: create one at the front of the run properties.
        fonts_el = OxmlElement("w:rFonts")
        run_props.insert(0, fonts_el)
    for slot in ("w:eastAsia", "w:ascii", "w:hAnsi", "w:cs"):
        fonts_el.set(qn(slot), FONT_CN)

    font.color.rgb = RGBColor(0, 0, 0)


def _normalize_style(doc) -> None:
    """Set the document's ``Normal`` style to ``FONT_CN`` at ``FONT_SIZE_PT``.

    Besides the python-docx font name/size, the east-asian, ascii and hAnsi
    slots of the style's ``w:rFonts`` element are set to ``FONT_CN``.
    """
    normal = doc.styles["Normal"]
    normal_font = normal.font
    normal_font.name = FONT_CN
    normal_font.size = Pt(FONT_SIZE_PT)

    style_props = normal.element.get_or_add_rPr()
    fonts_el = style_props.find(qn("w:rFonts"))
    if fonts_el is None:
        fonts_el = OxmlElement("w:rFonts")
        style_props.append(fonts_el)
    for slot in ("w:eastAsia", "w:ascii", "w:hAnsi"):
        fonts_el.set(qn(slot), FONT_CN)


def _configure_section(section) -> None:
    """Apply the page geometry used for every emitted document.

    Page is 21.0 cm x 29.7 cm; margins are 2.0 cm top/bottom/right and 2.5 cm
    left; header and footer sit 1.0 cm from the page edge.
    """
    geometry_cm = (
        ("page_height", 29.7),
        ("page_width", 21.0),
        ("top_margin", 2.0),
        ("bottom_margin", 2.0),
        ("left_margin", 2.5),
        ("right_margin", 2.0),
        ("header_distance", 1.0),
        ("footer_distance", 1.0),
    )
    for attribute, centimetres in geometry_cm:
        setattr(section, attribute, Cm(centimetres))


def _add_header(section, software_name: str, version: str) -> None:
    """Write ``<software-name> <version><TAB>第 <PAGE> 页`` into the header.

    The first header paragraph is reused (one is added if none exists),
    emptied, and refilled with 9 pt runs: the name/version label, a tab, and
    the page-number phrase whose number is a ``PAGE`` simple field.

    The right-aligned tab stop is placed at the section's text width
    (page width minus left and right margins) so the page number reaches the
    right margin. The previous fixed 8504 twips (15 cm) is kept only as a
    fallback when the section does not report its geometry.

    Args:
        section: python-docx section whose header is rewritten.
        software_name: Software title shown at the start of the header.
        version: Version string appended after the title.
    """
    header = section.header
    p = header.paragraphs[0] if header.paragraphs else header.add_paragraph()
    p.text = ""
    _set_run_font(p.add_run(f"{software_name} {version}"), size_pt=9)
    _set_run_font(p.add_run("\t"), size_pt=9)
    _set_run_font(p.add_run("第 "), size_pt=9)

    # PAGE field, built by hand as a w:fldSimple wrapping one run. The run
    # gets the same typeface slots as `_set_run_font` writes, so the page
    # number matches the rest of the header.
    fld = OxmlElement("w:fldSimple")
    fld.set(qn("w:instr"), "PAGE")
    r_inside = OxmlElement("w:r")
    rPr = OxmlElement("w:rPr")
    rFonts = OxmlElement("w:rFonts")
    for k in ("w:eastAsia", "w:ascii", "w:hAnsi", "w:cs"):
        rFonts.set(qn(k), FONT_CN)
    rPr.append(rFonts)  # rFonts goes ahead of w:sz inside w:rPr
    sz = OxmlElement("w:sz")
    sz.set(qn("w:val"), "18")  # w:sz is in half-points: 18 -> 9 pt
    rPr.append(sz)
    r_inside.append(rPr)
    t = OxmlElement("w:t")
    t.text = "1"  # placeholder result shown until the field is refreshed
    r_inside.append(t)
    fld.append(r_inside)
    p._p.append(fld)
    _set_run_font(p.add_run(" 页"), size_pt=9)

    # Right tab stop position in twips. python-docx lengths are EMU;
    # 914400 EMU per inch / 1440 twips per inch = 635 EMU per twip.
    tab_pos = 8504
    page_w = section.page_width
    left = section.left_margin
    right = section.right_margin
    if page_w is not None and left is not None and right is not None:
        text_width_emu = page_w - left - right
        if text_width_emu > 0:
            tab_pos = int(text_width_emu // 635)

    # NOTE(review): if the header paragraph's style already defines tab stops
    # ahead of this position, the single "\t" above will land on the first of
    # them instead — verify the rendered header in Word.
    pPr = p._p.get_or_add_pPr()
    tabs = OxmlElement("w:tabs")
    tab = OxmlElement("w:tab")
    tab.set(qn("w:val"), "right")
    tab.set(qn("w:pos"), str(tab_pos))
    tabs.append(tab)
    pPr.append(tabs)


def _new_doc(software_name: str, version: str):
    """Create a blank document with page geometry, header and Normal style set.

    Returns the new python-docx document.
    """
    document = docx.Document()
    first_section = document.sections[0]
    # Geometry first, then the header, then the Normal style.
    _configure_section(first_section)
    _add_header(first_section, software_name, version)
    _normalize_style(document)
    return document


def _add_title_block(doc, title: str, software_name: str, version: str) -> None:
    """Append the three-paragraph title block.

    Adds a centered bold 16 pt title, a centered 12 pt
    ``<software_name> · 版本 <version>`` subtitle, and one empty paragraph.
    """
    centered_lines = (
        (title, 16, True),
        (f"{software_name} · 版本 {version}", 12, False),
    )
    for text, points, is_bold in centered_lines:
        paragraph = doc.add_paragraph()
        paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
        _set_run_font(paragraph.add_run(text), size_pt=points, bold=is_bold)
    # Spacer paragraph between the title block and whatever follows.
    doc.add_paragraph()


def _add_code_line(doc, n: int, code: str, width: int) -> None:
    """Append one numbered source line as its own paragraph.

    The paragraph holds two runs: the line number right-aligned in ``width``
    columns followed by `` │ ``, then the code text. An empty ``code`` string
    is written as a single space. Paragraph spacing is zero before and after,
    with 1.15 line spacing.
    """
    line_para = doc.add_paragraph()
    fmt = line_para.paragraph_format
    fmt.space_before = fmt.space_after = Pt(0)
    fmt.line_spacing = 1.15

    number_prefix = f"{n:>{width}d} │ "
    code_text = " " if code == "" else code
    for text in (number_prefix, code_text):
        _set_run_font(line_para.add_run(text), size_pt=FONT_SIZE_PT)


# ─────────────────────────────────────────────────────────────────────────────
# Upstream-output discovery
# ─────────────────────────────────────────────────────────────────────────────

def _read_lines_from_docx(path: Path) -> list[str]:
    """Read a code .docx and return one source-line string per body paragraph.

    This script overwrites the upstream code files in place, so on a second
    run ``path`` may already be this script's own output. Such a file is
    recognised by its shape: three title-block paragraphs (title, subtitle,
    empty spacer) followed only by paragraphs that start with the
    ``<n> │ `` prefix written by `_add_code_line`. In that case the title
    block is dropped and the prefix stripped, so re-running does not re-ingest
    the title or double-number the lines. Any other file is returned
    paragraph-for-paragraph, unchanged.

    Args:
        path: Location of the .docx file to read.

    Returns:
        The recovered source lines, in document order.
    """
    d = docx.Document(str(path))
    texts = [pp.text for pp in d.paragraphs]

    if len(texts) > 3 and texts[2] == "":
        body = texts[3:]
        # Prefix shape produced by `_add_code_line`: space padding, the
        # number, then " │ ".
        matches = [re.match(r"^ *\d+ │ ", text) for text in body]
        if all(matches):
            return [text[m.end():] for text, m in zip(body, matches)]
    return texts


def _read_total_lines_from_form(form_path: Path) -> int | None:
    """Best-effort: pull "源程序量" from 申请表信息.txt (e.g. "4853 行" → 4853)."""
    if not form_path.exists():
        return None
    try:
        text = form_path.read_text(encoding="utf-8")
    except UnicodeDecodeError:
        text = form_path.read_text(encoding="utf-8", errors="ignore")
    m = re.search(r"源程序量[：:]\s*(\d+)", text)
    if m:
        return int(m.group(1))
    return None


def _safe_name(software_name: str) -> str:
    """Return ``software_name`` with ``\\ / : * ? " < > |`` removed and outer whitespace trimmed."""
    illegal_chars = re.compile(r'[\\/:*?"<>|]')
    cleaned = illegal_chars.sub("", software_name)
    return cleaned.strip()


def _find_upstream_artifacts(final_dir: Path, software_name: str):
    """Build the expected artifact paths inside ``final_dir``.

    Returns a 5-tuple of ``Path`` objects in the order
    ``(manual, front, last, all, form_txt)``. The paths are only constructed,
    never checked: callers must test ``.exists()`` themselves.
    """
    stem = _safe_name(software_name)
    file_names = (
        f"{stem}_操作手册.docx",
        f"{stem}-代码(前30页).docx",
        f"{stem}-代码(后30页).docx",
        f"{stem}-代码(全部).docx",
        "申请表信息.txt",
    )
    return tuple(final_dir / file_name for file_name in file_names)


# ─────────────────────────────────────────────────────────────────────────────
# Re-emitters
# ─────────────────────────────────────────────────────────────────────────────

def _emit_code_docx(
    out_path: Path,
    lines: list[str],
    start_no: int,
    width: int,
    software_name: str,
    version: str,
    title: str,
) -> None:
    """Write one numbered code document to ``out_path``.

    The document gets the standard header and title block, then one paragraph
    per entry of ``lines``, numbered consecutively from ``start_no`` with the
    number padded to ``width`` columns.
    """
    document = _new_doc(software_name, version)
    _add_title_block(document, title, software_name, version)
    for line_no, text in enumerate(lines, start=start_no):
        _add_code_line(document, line_no, text, width)
    document.save(str(out_path))


def _emit_all_docx(
    out_path: Path,
    front_lines: list[str],
    last_lines: list[str],
    last_start: int,
    width: int,
    software_name: str,
    version: str,
) -> None:
    """Write the combined "代码(全部)" document to ``out_path``.

    Layout: header and title block, an explanatory note, the front excerpt
    numbered from 1, an optional centered "omitted" banner, then the tail
    excerpt numbered from ``last_start``.

    The omitted-range sentence in the note and the banner are both emitted
    only when lines are actually skipped, i.e. when ``last_start - 1`` is
    greater than ``len(front_lines)``. Without that guard the note would
    announce an inverted range whenever the two excerpts are contiguous.

    Args:
        out_path: Destination .docx path.
        front_lines: Source lines of the front excerpt.
        last_lines: Source lines of the tail excerpt.
        last_start: Line number assigned to the first tail line.
        width: Column width used to right-align line numbers.
        software_name: Software title for the header and subtitle.
        version: Version string for the header and subtitle.
    """
    doc = _new_doc(software_name, version)
    _add_title_block(doc, "源程序材料（全部 · 60 页超长项目摘录版）", software_name, version)

    front_count = len(front_lines)
    has_gap = last_start - 1 > front_count

    note = (
        "说明：根据国家版权局软件著作权登记规则，源程序总量不足 60 页时按全部源代码提交；"
        "超过 60 页时按前 30 页与后 30 页合计提交。本《代码(全部).docx》为辅助参考材料，"
        f"前 {front_count} 行与后 {len(last_lines)} 行同步收录于"
        "《代码(前30页).docx》与《代码(后30页).docx》。"
    )
    if has_gap:
        # Only claim an omission when there really are skipped lines.
        note += f"中间第 {front_count + 1} 行至第 {last_start - 1} 行依规则省略。"

    note_p = doc.add_paragraph()
    _set_run_font(note_p.add_run(note), size_pt=FONT_SIZE_PT)
    doc.add_paragraph()

    for idx, line in enumerate(front_lines, start=1):
        _add_code_line(doc, idx, line, width)

    if has_gap:
        gap_p = doc.add_paragraph()
        gap_p.alignment = WD_ALIGN_PARAGRAPH.CENTER
        _set_run_font(
            gap_p.add_run(
                f"…  中间部分（第 {front_count + 1} 行 至 第 {last_start - 1} 行）"
                "依国家版权局规则省略  …"
            ),
            size_pt=FONT_SIZE_PT,
            italic=True,
        )

    for line_no, line in enumerate(last_lines, start=last_start):
        _add_code_line(doc, line_no, line, width)

    doc.save(str(out_path))


# ─────────────────────────────────────────────────────────────────────────────
# Deterministic-zip normalize
# ─────────────────────────────────────────────────────────────────────────────

# Dublin Core "terms" namespace URI, and the created / modified element names
# written in ``{namespace}local`` form.
# NOTE(review): none of these three names is referenced by the code visible in
# this file (`_patch_core_xml` matches the tags by regex instead) — confirm
# whether they are still needed.
_DCTERMS_NS = "http://purl.org/dc/terms/"
_DCTERMS_CREATED = f"{{{_DCTERMS_NS}}}created"
_DCTERMS_MODIFIED = f"{{{_DCTERMS_NS}}}modified"


def _patch_core_xml(data: bytes) -> bytes:
    """Replace the text of ``*:created`` / ``*:modified`` elements with ``EPOCH_ISO``.

    ``data`` is the raw ``docProps/core.xml`` payload. It is decoded as UTF-8,
    the text between each matching open and close tag is swapped for
    ``EPOCH_ISO`` (tags and attributes are left as they are), and the result
    is re-encoded. Input that is not valid UTF-8 is returned unchanged.
    """
    try:
        xml_text = data.decode("utf-8")
    except UnicodeDecodeError:
        return data

    # Prefix-agnostic match: any "<prefix:local ...>text</prefix:local>" pair,
    # e.g. <dcterms:created xsi:type="dcterms:W3CDTF">…</dcterms:created>.
    for local in ("created", "modified"):
        element_re = re.compile(
            rf"(<[^<>]*:{local}[^<>]*>)[^<>]*(</[^<>]*:{local}>)", re.DOTALL
        )
        xml_text = element_re.sub(rf"\g<1>{EPOCH_ISO}\g<2>", xml_text)

    return xml_text.encode("utf-8")


def _normalize_docx_deterministically(path: Path) -> None:
    """Rewrite the .docx at ``path`` in place with pinned zip metadata.

    Does nothing when ``path`` is missing or its suffix is not ``.docx``
    (case-insensitive). Otherwise:

      1. Every member of the archive is read into memory, in archive order.
      2. The ``docProps/core.xml`` payload is passed through `_patch_core_xml`.
      3. A new archive is built in memory where each entry has
         ``date_time = EPOCH_DATE_TIME``, ``create_system = 0``, zeroed flag
         bits, an empty extra field and ``ZIP_DEFLATED`` compression; the
         original external/internal attributes are carried over.
      4. The new archive's bytes replace the file at ``path``.
    """
    if not (path.exists() and path.suffix.lower() == ".docx"):
        return

    with zipfile.ZipFile(path, "r") as archive:
        entries: list[tuple[zipfile.ZipInfo, bytes]] = []
        for entry in archive.infolist():
            blob = archive.read(entry.filename)
            if entry.filename == "docProps/core.xml":
                blob = _patch_core_xml(blob)
            entries.append((entry, blob))

    rebuilt_bytes = io.BytesIO()
    with zipfile.ZipFile(rebuilt_bytes, "w", compression=zipfile.ZIP_DEFLATED) as rebuilt:
        for entry, blob in entries:
            pinned = zipfile.ZipInfo(filename=entry.filename, date_time=EPOCH_DATE_TIME)
            pinned.compress_type = zipfile.ZIP_DEFLATED
            pinned.create_system = 0  # MS-DOS
            pinned.external_attr = entry.external_attr
            pinned.internal_attr = entry.internal_attr
            pinned.flag_bits = 0
            pinned.extra = b""
            rebuilt.writestr(pinned, blob)

    path.write_bytes(rebuilt_bytes.getvalue())


# ─────────────────────────────────────────────────────────────────────────────
# Driver
# ─────────────────────────────────────────────────────────────────────────────

def main(argv: list[str] | None = None) -> int:
    p = argparse.ArgumentParser(
        description=(
            "Re-emit upstream Fokkyp/SoftwareCopyright-Skill code .docx output "
            "with SimSun 10.5pt + visible line numbers, and add a 4th "
            "代码(全部).docx so the final deliverable is 4 .docx + 1 .txt."
        ),
    )
    p.add_argument(
        "--workdir",
        default="软件著作权申请资料",
        help="Upstream output dir (the one that contains 正式资料/). Default: 软件著作权申请资料",
    )
    p.add_argument("--software-name", required=True, help="Same value you passed to upstream build_docx_from_md.py")
    p.add_argument("--version", default="V1.0", help="Software version; default V1.0")
    p.add_argument(
        "--total-lines",
        type=int,
        default=None,
        help="Override 源程序量 (auto-derived from 申请表信息.txt when omitted).",
    )
    p.add_argument(
        "--keep-upstream-backup",
        action="store_true",
        help="Keep the upstream-format code .docx as <name>.upstream.docx before overwriting.",
    )
    args = p.parse_args(argv)

    workdir = Path(args.workdir).resolve()
    final_dir = workdir / "正式资料"
    if not final_dir.is_dir():
        sys.stderr.write(
            f"[format-compliance] expected '{final_dir}' to exist after running "
            "upstream build_docx_from_md.py — found nothing.\n"
        )
        return 2

    manual_path, front_path, last_path, all_path, form_path = _find_upstream_artifacts(
        final_dir, args.software_name
    )

    if not front_path.exists() or not last_path.exists():
        sys.stderr.write(
            "[format-compliance] expected the upstream's two code .docx files in "
            f"{final_dir}.\n  looking for: {front_path.name} and {last_path.name}\n"
        )
        return 2

    total_lines = args.total_lines or _read_total_lines_from_form(form_path)

    front_lines = _read_lines_from_docx(front_path)
    last_lines = _read_lines_from_docx(last_path)

    # Numbering scheme: front 1..len(front). last_start derived from total lines
    # so the file matches the user's claimed source-program size. If the form
    # doesn't tell us the total, fall back to concatenated numbering.
    if total_lines and total_lines >= len(front_lines) + len(last_lines):
        last_start = total_lines - len(last_lines) + 1
    else:
        last_start = len(front_lines) + 1

    width = max(4, len(str(last_start + len(last_lines) - 1)))

    if args.keep_upstream_backup:
        shutil.copy2(front_path, front_path.with_suffix(".upstream.docx"))
        shutil.copy2(last_path, last_path.with_suffix(".upstream.docx"))

    _emit_code_docx(
        front_path,
        front_lines,
        start_no=1,
        width=width,
        software_name=args.software_name,
        version=args.version,
        title="源程序材料（前 30 页）",
    )
    _emit_code_docx(
        last_path,
        last_lines,
        start_no=last_start,
        width=width,
        software_name=args.software_name,
        version=args.version,
        title="源程序材料（后 30 页）",
    )
    _emit_all_docx(
        all_path,
        front_lines,
        last_lines,
        last_start=last_start,
        width=width,
        software_name=args.software_name,
        version=args.version,
    )

    # Make every .docx in the final dir byte-deterministic across runs.
    # Without this, DOCX zip entry timestamps + docProps/core.xml dcterms
    # drift on each run, so identical-content runs produce different fingerprints
    # — the regression this step prevents. Pinning DOS date_time and
    # dcterms:created/modified makes the fingerprint a function of input only.
    normalized = []
    for candidate in (manual_path, front_path, last_path, all_path):
        if candidate.exists():
            _normalize_docx_deterministically(candidate)
            normalized.append(candidate.name)

    print(
        f"[format-compliance] ok\n"
        f"  · {front_path.name}  ({len(front_lines)} lines, 1..{len(front_lines)})\n"
        f"  · {last_path.name}   ({len(last_lines)} lines, {last_start}..{last_start + len(last_lines) - 1})\n"
        f"  · {all_path.name}    (front + gap + last)\n"
        "All code .docx now render in SimSun 10.5pt with a visible '<n> │ ' line-number prefix."
    )

    if manual_path.exists():
        print(f"  · manual untouched (content):  {manual_path.name}")
    if form_path.exists():
        print(f"  · application form untouched: {form_path.name}")
    if normalized:
        print(
            f"  · {len(normalized)} .docx normalized for byte-determinism "
            f"(zip entry timestamps + dcterms pinned to {EPOCH_ISO})."
        )

    print("\nNext: upload to https://register.ccopyright.com.cn")
    return 0


# Script entry point: run the driver and exit with its return value as the
# process status.
if __name__ == "__main__":
    raise SystemExit(main())