Add spreadsheet diff by designator
Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
122
diff_spreadsheets.py
Normal file
122
diff_spreadsheets.py
Normal file
@@ -0,0 +1,122 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Diff two spreadsheets by designator column. Data starts at row 10 by default.
|
||||
|
||||
Usage:
|
||||
python3 diff_spreadsheets.py file1.xlsx file2.xlsx [-o output.json]
|
||||
python3 diff_spreadsheets.py # uses SHEET1, SHEET2, DIFF_OUTPUT from .env
|
||||
|
||||
All paths and options: use .env or CLI; CLI overrides .env.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
try:
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv()
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
try:
|
||||
import pandas as pd
|
||||
except ImportError:
|
||||
print("Install pandas and openpyxl: pip install pandas openpyxl", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
def read_designators(path: str, designator_col: int = 0, start_row: int = 9) -> set[str]:
|
||||
"""Load spreadsheet and return set of designator values from the given column, starting at start_row (0-based; 9 = row 10)."""
|
||||
p = Path(path)
|
||||
if not p.exists():
|
||||
raise FileNotFoundError(path)
|
||||
suffix = p.suffix.lower()
|
||||
if suffix in (".xlsx", ".xls"):
|
||||
df = pd.read_excel(path, header=None, engine="openpyxl" if suffix == ".xlsx" else None)
|
||||
elif suffix == ".csv":
|
||||
df = pd.read_csv(path, header=None)
|
||||
else:
|
||||
raise ValueError(f"Unsupported format: {suffix}")
|
||||
if designator_col >= df.shape[1]:
|
||||
raise ValueError(f"Column {designator_col} not in sheet (has {df.shape[1]} columns)")
|
||||
# from start_row to end, take the designator column, drop NaN, strip strings
|
||||
col = df.iloc[start_row:, designator_col]
|
||||
values = set()
|
||||
for v in col:
|
||||
if pd.isna(v):
|
||||
continue
|
||||
s = str(v).strip()
|
||||
if s:
|
||||
values.add(s)
|
||||
return values
|
||||
|
||||
|
||||
def main() -> int:
|
||||
default1 = os.environ.get("SHEET1", "").strip() or None
|
||||
default2 = os.environ.get("SHEET2", "").strip() or None
|
||||
default_out = os.environ.get("DIFF_OUTPUT", "").strip() or "output/spreadsheet_diff.json"
|
||||
default_col = os.environ.get("DESIGNATOR_COL", "").strip()
|
||||
default_start = os.environ.get("START_ROW", "").strip()
|
||||
try:
|
||||
default_col = int(default_col) if default_col else 0
|
||||
except ValueError:
|
||||
default_col = 0
|
||||
try:
|
||||
default_start = int(default_start) if default_start else 9
|
||||
except ValueError:
|
||||
default_start = 9
|
||||
|
||||
parser = argparse.ArgumentParser(description="Diff two spreadsheets by designator column")
|
||||
parser.add_argument("file1", nargs="?", default=default1, help="First spreadsheet (default: SHEET1 from .env)")
|
||||
parser.add_argument("file2", nargs="?", default=default2, help="Second spreadsheet (default: SHEET2 from .env)")
|
||||
parser.add_argument("-o", "--output", default=default_out, help="Output JSON (default: DIFF_OUTPUT from .env)")
|
||||
parser.add_argument("--designator-col", type=int, default=default_col, help="Designator column 0-based (default: DESIGNATOR_COL from .env or 0)")
|
||||
parser.add_argument("--start-row", type=int, default=default_start, help="First data row 0-based, 9=row 10 (default: START_ROW from .env or 9)")
|
||||
args = parser.parse_args()
|
||||
|
||||
path1 = (args.file1 or default1 or "").strip()
|
||||
path2 = (args.file2 or default2 or "").strip()
|
||||
if not path1 or not path2:
|
||||
parser.error("Need two files. Set SHEET1 and SHEET2 in .env or pass two paths.")
|
||||
|
||||
try:
|
||||
d1 = read_designators(path1, args.designator_col, args.start_row)
|
||||
d2 = read_designators(path2, args.designator_col, args.start_row)
|
||||
except Exception as e:
|
||||
print(f"Error: {e}", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
only1 = sorted(d1 - d2)
|
||||
only2 = sorted(d2 - d1)
|
||||
both = sorted(d1 & d2)
|
||||
|
||||
report = {
|
||||
"file1": path1,
|
||||
"file2": path2,
|
||||
"designator_col": args.designator_col,
|
||||
"start_row": args.start_row + 1,
|
||||
"only_in_file1": only1,
|
||||
"only_in_file2": only2,
|
||||
"in_both": both,
|
||||
"count_only_in_file1": len(only1),
|
||||
"count_only_in_file2": len(only2),
|
||||
"count_in_both": len(both),
|
||||
}
|
||||
|
||||
out_path = Path(args.output)
|
||||
out_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
out_path.write_text(json.dumps(report, indent=2), encoding="utf-8")
|
||||
print(f"Wrote {args.output}")
|
||||
print(f"Only in file1: {len(only1)} | Only in file2: {len(only2)} | In both: {len(both)}")
|
||||
if only1:
|
||||
print(" Only in file1:", ", ".join(only1[:20]) + (" ..." if len(only1) > 20 else ""))
|
||||
if only2:
|
||||
print(" Only in file2:", ", ".join(only2[:20]) + (" ..." if len(only2) > 20 else ""))
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
Reference in New Issue
Block a user