Files

205 lines
6.4 KiB
Python
Raw Permalink Normal View History

"""
AMEX NZ PDF Statement Parser
Handles the Airpoints Platinum Card statement format where transactions
appear as: DD.MM.YY MERCHANT NAME [LOCATION] AMOUNT [CR]
"""
import re
import pdfplumber
from dataclasses import dataclass, field
from typing import Optional
DATE_RE = re.compile(r'^(\d{2}\.\d{2}\.\d{2})\s+(.*)')
AMOUNT_RE = re.compile(r'^([\d,]+\.\d{2})(CR)?$')
AMOUNT_INLINE_RE = re.compile(r'^(.*?)\s+([\d,]+\.\d{2})\s*(CR)?$')
SKIP_PATTERNS = [
r'^MATTHEW BRUCE COHEN',
r'^XXXX-XXXXXX-\d+',
r'^Page \d+ / \d+',
r'^Details\s+Foreign Spending',
r'^Amount \$',
r'^Prepared for',
r'^Membership Number',
r'^Opening Date',
r'^Closing Date',
r'^Airpoints Platinum Card',
r'^Statement',
r'^americanexpress',
r'^Please check all',
r'^Total of New Transactions',
r'^Opening Balance',
r'^Credit Summary',
r'^Current Rate of Interest',
r'^Statement Period',
r'^Annual Rate',
r'^Credit Limit',
r'^\d+\.\d{2}\s*[-+]', # summary balance line
r'^Minimum Payment',
r'^Due by',
r'^NZD \d', # foreign currency note
r'^\d+\.\d{2} UNITED STATES',
r'^DOLLAR',
r'^NZD \d+\.\d{2} includes',
r'^\.\.\.',
r'^If you',
r'^Please pay',
r'^Visit www',
r'^balance\.',
r'^interest\.',
r'^American Express',
r'^Incorporated',
r'^PO Box',
r'^Auckland',
r'^New Zealand',
]
SKIP_RES = [re.compile(p, re.IGNORECASE) for p in SKIP_PATTERNS]
@dataclass
class Transaction:
date: str # DD.MM.YY
description: str
amount: float
is_credit: bool
def _should_skip(line: str) -> bool:
for pattern in SKIP_RES:
if pattern.match(line):
return True
return False
def _parse_amount(text: str) -> tuple[Optional[float], bool]:
"""Extract amount and credit flag from a string like '1,242.55CR' or '21.45'."""
text = text.strip()
m = AMOUNT_RE.match(text)
if m:
amount = float(m.group(1).replace(',', ''))
is_credit = bool(m.group(2))
return amount, is_credit
return None, False
def _extract_lines(page) -> list[str]:
"""
Use word bounding boxes to reconstruct lines.
Groups words by Y-position and sorts each group by X.
Amounts (rightmost column) are appended at end of line with a tab separator.
"""
words = page.extract_words(x_tolerance=5, y_tolerance=4)
if not words:
return []
# Determine amount-column threshold (rightmost ~22% of page)
page_width = page.width
amount_threshold = page_width * 0.78
# Group words by rounded Y position
buckets: dict[int, dict] = {}
for w in words:
key = round(w['top'] / 4) * 4
if key not in buckets:
buckets[key] = {'left': [], 'right': []}
if w['x0'] >= amount_threshold:
buckets[key]['right'].append((w['x0'], w['text']))
else:
buckets[key]['left'].append((w['x0'], w['text']))
lines = []
for y in sorted(buckets):
left = ' '.join(t for _, t in sorted(buckets[y]['left']))
right = ' '.join(t for _, t in sorted(buckets[y]['right']))
line = left
if right:
line = f"{left}\t{right}" if left else right
lines.append(line.strip())
return [l for l in lines if l]
def parse_statement(pdf_path: str) -> list[Transaction]:
"""Parse all transactions from an AMEX NZ PDF statement."""
transactions: list[Transaction] = []
pending: Optional[dict] = None
def commit(tx):
if tx and tx.get('amount') is not None:
transactions.append(Transaction(
date=tx['date'],
description=tx['description'].strip(),
amount=tx['amount'],
is_credit=tx['is_credit'],
))
with pdfplumber.open(pdf_path) as pdf:
for page in pdf.pages:
for raw_line in _extract_lines(page):
# Split into description part and amount part (tab-separated)
if '\t' in raw_line:
desc_part, amount_part = raw_line.split('\t', 1)
else:
desc_part, amount_part = raw_line, ''
desc_part = desc_part.strip()
amount_part = amount_part.strip()
if _should_skip(desc_part) and not amount_part:
continue
# Check if line starts a new transaction
date_match = DATE_RE.match(desc_part)
if date_match:
commit(pending)
date = date_match.group(1)
remainder = date_match.group(2).strip()
# Amount might be inline in the description part
inline = AMOUNT_INLINE_RE.match(remainder)
if inline:
description = inline.group(1).strip()
amount = float(inline.group(2).replace(',', ''))
is_credit = bool(inline.group(3))
else:
description = remainder
amount = None
is_credit = False
# Check right-column amount
if amount_part:
a, c = _parse_amount(amount_part)
if a is not None:
amount, is_credit = a, c
pending = {
'date': date,
'description': description,
'amount': amount,
'is_credit': is_credit,
}
elif pending:
# Continuation line — could be amount or CR
if amount_part and pending['amount'] is None:
a, c = _parse_amount(amount_part)
if a is not None:
pending['amount'] = a
pending['is_credit'] = c
# Plain amount on its own line (no tab split)
stripped = desc_part.strip()
if not amount_part:
a, c = _parse_amount(stripped)
if a is not None and pending['amount'] is None:
pending['amount'] = a
pending['is_credit'] = c
elif stripped == 'CR':
pending['is_credit'] = True
commit(pending)
return transactions