""" AMEX NZ PDF Statement Parser Handles the Airpoints Platinum Card statement format where transactions appear as: DD.MM.YY MERCHANT NAME [LOCATION] AMOUNT [CR] """ import re import pdfplumber from dataclasses import dataclass, field from typing import Optional DATE_RE = re.compile(r'^(\d{2}\.\d{2}\.\d{2})\s+(.*)') AMOUNT_RE = re.compile(r'^([\d,]+\.\d{2})(CR)?$') AMOUNT_INLINE_RE = re.compile(r'^(.*?)\s+([\d,]+\.\d{2})\s*(CR)?$') SKIP_PATTERNS = [ r'^MATTHEW BRUCE COHEN', r'^XXXX-XXXXXX-\d+', r'^Page \d+ / \d+', r'^Details\s+Foreign Spending', r'^Amount \$', r'^Prepared for', r'^Membership Number', r'^Opening Date', r'^Closing Date', r'^Airpoints Platinum Card', r'^Statement', r'^americanexpress', r'^Please check all', r'^Total of New Transactions', r'^Opening Balance', r'^Credit Summary', r'^Current Rate of Interest', r'^Statement Period', r'^Annual Rate', r'^Credit Limit', r'^\d+\.\d{2}\s*[-+]', # summary balance line r'^Minimum Payment', r'^Due by', r'^NZD \d', # foreign currency note r'^\d+\.\d{2} UNITED STATES', r'^DOLLAR', r'^NZD \d+\.\d{2} includes', r'^\.\.\.', r'^If you', r'^Please pay', r'^Visit www', r'^balance\.', r'^interest\.', r'^American Express', r'^Incorporated', r'^PO Box', r'^Auckland', r'^New Zealand', ] SKIP_RES = [re.compile(p, re.IGNORECASE) for p in SKIP_PATTERNS] @dataclass class Transaction: date: str # DD.MM.YY description: str amount: float is_credit: bool def _should_skip(line: str) -> bool: for pattern in SKIP_RES: if pattern.match(line): return True return False def _parse_amount(text: str) -> tuple[Optional[float], bool]: """Extract amount and credit flag from a string like '1,242.55CR' or '21.45'.""" text = text.strip() m = AMOUNT_RE.match(text) if m: amount = float(m.group(1).replace(',', '')) is_credit = bool(m.group(2)) return amount, is_credit return None, False def _extract_lines(page) -> list[str]: """ Use word bounding boxes to reconstruct lines. Groups words by Y-position and sorts each group by X. Amounts (rightmost column) are appended at end of line with a tab separator. """ words = page.extract_words(x_tolerance=5, y_tolerance=4) if not words: return [] # Determine amount-column threshold (rightmost ~22% of page) page_width = page.width amount_threshold = page_width * 0.78 # Group words by rounded Y position buckets: dict[int, dict] = {} for w in words: key = round(w['top'] / 4) * 4 if key not in buckets: buckets[key] = {'left': [], 'right': []} if w['x0'] >= amount_threshold: buckets[key]['right'].append((w['x0'], w['text'])) else: buckets[key]['left'].append((w['x0'], w['text'])) lines = [] for y in sorted(buckets): left = ' '.join(t for _, t in sorted(buckets[y]['left'])) right = ' '.join(t for _, t in sorted(buckets[y]['right'])) line = left if right: line = f"{left}\t{right}" if left else right lines.append(line.strip()) return [l for l in lines if l] def parse_statement(pdf_path: str) -> list[Transaction]: """Parse all transactions from an AMEX NZ PDF statement.""" transactions: list[Transaction] = [] pending: Optional[dict] = None def commit(tx): if tx and tx.get('amount') is not None: transactions.append(Transaction( date=tx['date'], description=tx['description'].strip(), amount=tx['amount'], is_credit=tx['is_credit'], )) with pdfplumber.open(pdf_path) as pdf: for page in pdf.pages: for raw_line in _extract_lines(page): # Split into description part and amount part (tab-separated) if '\t' in raw_line: desc_part, amount_part = raw_line.split('\t', 1) else: desc_part, amount_part = raw_line, '' desc_part = desc_part.strip() amount_part = amount_part.strip() if _should_skip(desc_part) and not amount_part: continue # Check if line starts a new transaction date_match = DATE_RE.match(desc_part) if date_match: commit(pending) date = date_match.group(1) remainder = date_match.group(2).strip() # Amount might be inline in the description part inline = AMOUNT_INLINE_RE.match(remainder) if inline: description = inline.group(1).strip() amount = float(inline.group(2).replace(',', '')) is_credit = bool(inline.group(3)) else: description = remainder amount = None is_credit = False # Check right-column amount if amount_part: a, c = _parse_amount(amount_part) if a is not None: amount, is_credit = a, c pending = { 'date': date, 'description': description, 'amount': amount, 'is_credit': is_credit, } elif pending: # Continuation line — could be amount or CR if amount_part and pending['amount'] is None: a, c = _parse_amount(amount_part) if a is not None: pending['amount'] = a pending['is_credit'] = c # Plain amount on its own line (no tab split) stripped = desc_part.strip() if not amount_part: a, c = _parse_amount(stripped) if a is not None and pending['amount'] is None: pending['amount'] = a pending['is_credit'] = c elif stripped == 'CR': pending['is_credit'] = True commit(pending) return transactions