205 lines
6.4 KiB
Python
205 lines
6.4 KiB
Python
|
|
"""
|
||
|
|
AMEX NZ PDF Statement Parser
|
||
|
|
|
||
|
|
Handles the Airpoints Platinum Card statement format where transactions
|
||
|
|
appear as: DD.MM.YY MERCHANT NAME [LOCATION] AMOUNT [CR]
|
||
|
|
"""
|
||
|
|
|
||
|
|
import re
|
||
|
|
import pdfplumber
|
||
|
|
from dataclasses import dataclass, field
|
||
|
|
from typing import Optional
|
||
|
|
|
||
|
|
DATE_RE = re.compile(r'^(\d{2}\.\d{2}\.\d{2})\s+(.*)')
|
||
|
|
AMOUNT_RE = re.compile(r'^([\d,]+\.\d{2})(CR)?$')
|
||
|
|
AMOUNT_INLINE_RE = re.compile(r'^(.*?)\s+([\d,]+\.\d{2})\s*(CR)?$')
|
||
|
|
|
||
|
|
SKIP_PATTERNS = [
|
||
|
|
r'^MATTHEW BRUCE COHEN',
|
||
|
|
r'^XXXX-XXXXXX-\d+',
|
||
|
|
r'^Page \d+ / \d+',
|
||
|
|
r'^Details\s+Foreign Spending',
|
||
|
|
r'^Amount \$',
|
||
|
|
r'^Prepared for',
|
||
|
|
r'^Membership Number',
|
||
|
|
r'^Opening Date',
|
||
|
|
r'^Closing Date',
|
||
|
|
r'^Airpoints Platinum Card',
|
||
|
|
r'^Statement',
|
||
|
|
r'^americanexpress',
|
||
|
|
r'^Please check all',
|
||
|
|
r'^Total of New Transactions',
|
||
|
|
r'^Opening Balance',
|
||
|
|
r'^Credit Summary',
|
||
|
|
r'^Current Rate of Interest',
|
||
|
|
r'^Statement Period',
|
||
|
|
r'^Annual Rate',
|
||
|
|
r'^Credit Limit',
|
||
|
|
r'^\d+\.\d{2}\s*[-+]', # summary balance line
|
||
|
|
r'^Minimum Payment',
|
||
|
|
r'^Due by',
|
||
|
|
r'^NZD \d', # foreign currency note
|
||
|
|
r'^\d+\.\d{2} UNITED STATES',
|
||
|
|
r'^DOLLAR',
|
||
|
|
r'^NZD \d+\.\d{2} includes',
|
||
|
|
r'^\.\.\.',
|
||
|
|
r'^If you',
|
||
|
|
r'^Please pay',
|
||
|
|
r'^Visit www',
|
||
|
|
r'^balance\.',
|
||
|
|
r'^interest\.',
|
||
|
|
r'^American Express',
|
||
|
|
r'^Incorporated',
|
||
|
|
r'^PO Box',
|
||
|
|
r'^Auckland',
|
||
|
|
r'^New Zealand',
|
||
|
|
]
|
||
|
|
|
||
|
|
SKIP_RES = [re.compile(p, re.IGNORECASE) for p in SKIP_PATTERNS]
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass
|
||
|
|
class Transaction:
|
||
|
|
date: str # DD.MM.YY
|
||
|
|
description: str
|
||
|
|
amount: float
|
||
|
|
is_credit: bool
|
||
|
|
|
||
|
|
|
||
|
|
def _should_skip(line: str) -> bool:
|
||
|
|
for pattern in SKIP_RES:
|
||
|
|
if pattern.match(line):
|
||
|
|
return True
|
||
|
|
return False
|
||
|
|
|
||
|
|
|
||
|
|
def _parse_amount(text: str) -> tuple[Optional[float], bool]:
|
||
|
|
"""Extract amount and credit flag from a string like '1,242.55CR' or '21.45'."""
|
||
|
|
text = text.strip()
|
||
|
|
m = AMOUNT_RE.match(text)
|
||
|
|
if m:
|
||
|
|
amount = float(m.group(1).replace(',', ''))
|
||
|
|
is_credit = bool(m.group(2))
|
||
|
|
return amount, is_credit
|
||
|
|
return None, False
|
||
|
|
|
||
|
|
|
||
|
|
def _extract_lines(page) -> list[str]:
|
||
|
|
"""
|
||
|
|
Use word bounding boxes to reconstruct lines.
|
||
|
|
Groups words by Y-position and sorts each group by X.
|
||
|
|
Amounts (rightmost column) are appended at end of line with a tab separator.
|
||
|
|
"""
|
||
|
|
words = page.extract_words(x_tolerance=5, y_tolerance=4)
|
||
|
|
if not words:
|
||
|
|
return []
|
||
|
|
|
||
|
|
# Determine amount-column threshold (rightmost ~22% of page)
|
||
|
|
page_width = page.width
|
||
|
|
amount_threshold = page_width * 0.78
|
||
|
|
|
||
|
|
# Group words by rounded Y position
|
||
|
|
buckets: dict[int, dict] = {}
|
||
|
|
for w in words:
|
||
|
|
key = round(w['top'] / 4) * 4
|
||
|
|
if key not in buckets:
|
||
|
|
buckets[key] = {'left': [], 'right': []}
|
||
|
|
if w['x0'] >= amount_threshold:
|
||
|
|
buckets[key]['right'].append((w['x0'], w['text']))
|
||
|
|
else:
|
||
|
|
buckets[key]['left'].append((w['x0'], w['text']))
|
||
|
|
|
||
|
|
lines = []
|
||
|
|
for y in sorted(buckets):
|
||
|
|
left = ' '.join(t for _, t in sorted(buckets[y]['left']))
|
||
|
|
right = ' '.join(t for _, t in sorted(buckets[y]['right']))
|
||
|
|
line = left
|
||
|
|
if right:
|
||
|
|
line = f"{left}\t{right}" if left else right
|
||
|
|
lines.append(line.strip())
|
||
|
|
|
||
|
|
return [l for l in lines if l]
|
||
|
|
|
||
|
|
|
||
|
|
def parse_statement(pdf_path: str) -> list[Transaction]:
|
||
|
|
"""Parse all transactions from an AMEX NZ PDF statement."""
|
||
|
|
transactions: list[Transaction] = []
|
||
|
|
pending: Optional[dict] = None
|
||
|
|
|
||
|
|
def commit(tx):
|
||
|
|
if tx and tx.get('amount') is not None:
|
||
|
|
transactions.append(Transaction(
|
||
|
|
date=tx['date'],
|
||
|
|
description=tx['description'].strip(),
|
||
|
|
amount=tx['amount'],
|
||
|
|
is_credit=tx['is_credit'],
|
||
|
|
))
|
||
|
|
|
||
|
|
with pdfplumber.open(pdf_path) as pdf:
|
||
|
|
for page in pdf.pages:
|
||
|
|
for raw_line in _extract_lines(page):
|
||
|
|
# Split into description part and amount part (tab-separated)
|
||
|
|
if '\t' in raw_line:
|
||
|
|
desc_part, amount_part = raw_line.split('\t', 1)
|
||
|
|
else:
|
||
|
|
desc_part, amount_part = raw_line, ''
|
||
|
|
|
||
|
|
desc_part = desc_part.strip()
|
||
|
|
amount_part = amount_part.strip()
|
||
|
|
|
||
|
|
if _should_skip(desc_part) and not amount_part:
|
||
|
|
continue
|
||
|
|
|
||
|
|
# Check if line starts a new transaction
|
||
|
|
date_match = DATE_RE.match(desc_part)
|
||
|
|
if date_match:
|
||
|
|
commit(pending)
|
||
|
|
date = date_match.group(1)
|
||
|
|
remainder = date_match.group(2).strip()
|
||
|
|
|
||
|
|
# Amount might be inline in the description part
|
||
|
|
inline = AMOUNT_INLINE_RE.match(remainder)
|
||
|
|
if inline:
|
||
|
|
description = inline.group(1).strip()
|
||
|
|
amount = float(inline.group(2).replace(',', ''))
|
||
|
|
is_credit = bool(inline.group(3))
|
||
|
|
else:
|
||
|
|
description = remainder
|
||
|
|
amount = None
|
||
|
|
is_credit = False
|
||
|
|
|
||
|
|
# Check right-column amount
|
||
|
|
if amount_part:
|
||
|
|
a, c = _parse_amount(amount_part)
|
||
|
|
if a is not None:
|
||
|
|
amount, is_credit = a, c
|
||
|
|
|
||
|
|
pending = {
|
||
|
|
'date': date,
|
||
|
|
'description': description,
|
||
|
|
'amount': amount,
|
||
|
|
'is_credit': is_credit,
|
||
|
|
}
|
||
|
|
|
||
|
|
elif pending:
|
||
|
|
# Continuation line — could be amount or CR
|
||
|
|
if amount_part and pending['amount'] is None:
|
||
|
|
a, c = _parse_amount(amount_part)
|
||
|
|
if a is not None:
|
||
|
|
pending['amount'] = a
|
||
|
|
pending['is_credit'] = c
|
||
|
|
|
||
|
|
# Plain amount on its own line (no tab split)
|
||
|
|
stripped = desc_part.strip()
|
||
|
|
if not amount_part:
|
||
|
|
a, c = _parse_amount(stripped)
|
||
|
|
if a is not None and pending['amount'] is None:
|
||
|
|
pending['amount'] = a
|
||
|
|
pending['is_credit'] = c
|
||
|
|
elif stripped == 'CR':
|
||
|
|
pending['is_credit'] = True
|
||
|
|
|
||
|
|
commit(pending)
|
||
|
|
|
||
|
|
return transactions
|