#!/usr/bin/env python3 """ Diff two spreadsheets by designator column. Data starts at row 10 by default. Usage: python3 diff_spreadsheets.py file1.xlsx file2.xlsx [-o output.json] python3 diff_spreadsheets.py # uses SHEET1, SHEET2, DIFF_OUTPUT from .env All paths and options: use .env or CLI; CLI overrides .env. """ import argparse import json import os import sys from pathlib import Path try: from dotenv import load_dotenv load_dotenv() except ImportError: pass try: import pandas as pd except ImportError: print("Install pandas and openpyxl: pip install pandas openpyxl", file=sys.stderr) sys.exit(1) def read_designators(path: str, designator_col: int = 0, start_row: int = 9) -> set[str]: """Load spreadsheet and return set of designator values from the given column, starting at start_row (0-based; 9 = row 10).""" p = Path(path) if not p.exists(): raise FileNotFoundError(path) suffix = p.suffix.lower() if suffix in (".xlsx", ".xls"): df = pd.read_excel(path, header=None, engine="openpyxl" if suffix == ".xlsx" else None) elif suffix == ".csv": df = pd.read_csv(path, header=None) else: raise ValueError(f"Unsupported format: {suffix}") if designator_col >= df.shape[1]: raise ValueError(f"Column {designator_col} not in sheet (has {df.shape[1]} columns)") # from start_row to end, take the designator column, drop NaN, strip strings col = df.iloc[start_row:, designator_col] values = set() for v in col: if pd.isna(v): continue s = str(v).strip() if s: values.add(s) return values def main() -> int: default1 = os.environ.get("SHEET1", "").strip() or None default2 = os.environ.get("SHEET2", "").strip() or None default_out = os.environ.get("DIFF_OUTPUT", "").strip() or "outputs/spreadsheet_diff.json" default_col = os.environ.get("DESIGNATOR_COL", "").strip() default_start = os.environ.get("START_ROW", "").strip() try: default_col = int(default_col) if default_col else 0 except ValueError: default_col = 0 try: default_start = int(default_start) if default_start else 9 except ValueError: default_start = 9 parser = argparse.ArgumentParser(description="Diff two spreadsheets by designator column") parser.add_argument("file1", nargs="?", default=default1, help="First spreadsheet (default: SHEET1 from .env)") parser.add_argument("file2", nargs="?", default=default2, help="Second spreadsheet (default: SHEET2 from .env)") parser.add_argument("-o", "--output", default=default_out, help="Output JSON (default: DIFF_OUTPUT from .env)") parser.add_argument("--designator-col", type=int, default=default_col, help="Designator column 0-based (default: DESIGNATOR_COL from .env or 0)") parser.add_argument("--start-row", type=int, default=default_start, help="First data row 0-based, 9=row 10 (default: START_ROW from .env or 9)") args = parser.parse_args() path1 = (args.file1 or default1 or "").strip() path2 = (args.file2 or default2 or "").strip() if not path1 or not path2: parser.error("Need two files. Set SHEET1 and SHEET2 in .env or pass two paths.") try: d1 = read_designators(path1, args.designator_col, args.start_row) d2 = read_designators(path2, args.designator_col, args.start_row) except Exception as e: print(f"Error: {e}", file=sys.stderr) return 1 only1 = sorted(d1 - d2) only2 = sorted(d2 - d1) both = sorted(d1 & d2) report = { "file1": path1, "file2": path2, "designator_col": args.designator_col, "start_row": args.start_row + 1, "only_in_file1": only1, "only_in_file2": only2, "in_both": both, "count_only_in_file1": len(only1), "count_only_in_file2": len(only2), "count_in_both": len(both), } out_path = Path(args.output) out_path.parent.mkdir(parents=True, exist_ok=True) out_path.write_text(json.dumps(report, indent=2), encoding="utf-8") print(f"Wrote {args.output}") print(f"Only in file1: {len(only1)} | Only in file2: {len(only2)} | In both: {len(both)}") if only1: print(" Only in file1:", ", ".join(only1[:20]) + (" ..." if len(only1) > 20 else "")) if only2: print(" Only in file2:", ", ".join(only2[:20]) + (" ..." if len(only2) > 20 else "")) return 0 if __name__ == "__main__": sys.exit(main())