Initial commit — AmexPal statement analyser
Python/Flask backend with pdfplumber parser, Svelte 4 frontend, Docker multi-stage build. Includes category analysis, insights, monthly/weekly charts, subscription audit, and annualised projections. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
+204
@@ -0,0 +1,204 @@
|
||||
"""
|
||||
AMEX NZ PDF Statement Parser
|
||||
|
||||
Handles the Airpoints Platinum Card statement format where transactions
|
||||
appear as: DD.MM.YY MERCHANT NAME [LOCATION] AMOUNT [CR]
|
||||
"""
|
||||
|
||||
import re
|
||||
import pdfplumber
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional
|
||||
|
||||
# Start of a transaction row: "DD.MM.YY <rest of the line>".
DATE_RE = re.compile(r'^(\d{2}\.\d{2}\.\d{2})\s+(.*)')

# A standalone amount such as "21.45", "1,242.55CR" or "1,242.55 CR".
# Whitespace before the CR marker is accepted because the amount and the
# marker can be extracted as two separate words and re-joined with a space;
# AMOUNT_INLINE_RE already tolerates the same gap.
AMOUNT_RE = re.compile(r'^([\d,]+\.\d{2})\s*(CR)?$')

# Description text followed by a trailing amount (and optional CR marker).
AMOUNT_INLINE_RE = re.compile(r'^(.*?)\s+([\d,]+\.\d{2})\s*(CR)?$')
|
||||
|
||||
# Line prefixes that mark non-transaction text (headers, footers, summary and
# legal boilerplate).  A line matching any of these is ignored by the parser
# when it carries no amount column.
# NOTE(review): the first entry is a hard-coded cardholder name; statements
# for any other cardholder will not have their name line skipped.
SKIP_PATTERNS = [
    r'^MATTHEW BRUCE COHEN',
    r'^XXXX-XXXXXX-\d+',
    r'^Page \d+ / \d+',
    r'^Details\s+Foreign Spending',
    r'^Amount \$',
    r'^Prepared for',
    r'^Membership Number',
    r'^Opening Date',
    r'^Closing Date',
    r'^Airpoints Platinum Card',
    r'^Statement',
    r'^americanexpress',
    r'^Please check all',
    r'^Total of New Transactions',
    r'^Opening Balance',
    r'^Credit Summary',
    r'^Current Rate of Interest',
    r'^Statement Period',
    r'^Annual Rate',
    r'^Credit Limit',
    r'^\d+\.\d{2}\s*[-+]',  # summary balance line
    r'^Minimum Payment',
    r'^Due by',
    r'^NZD \d',  # foreign currency note
    r'^\d+\.\d{2} UNITED STATES',
    r'^DOLLAR',
    r'^NZD \d+\.\d{2} includes',
    r'^\.\.\.',
    r'^If you',
    r'^Please pay',
    r'^Visit www',
    r'^balance\.',
    r'^interest\.',
    r'^American Express',
    r'^Incorporated',
    r'^PO Box',
    r'^Auckland',
    r'^New Zealand',
]

# Compiled once at import time; matching is case-insensitive.
SKIP_RES = [
    re.compile(pattern, flags=re.IGNORECASE)
    for pattern in SKIP_PATTERNS
]
|
||||
|
||||
|
||||
@dataclass
class Transaction:
    """One entry parsed from a statement."""

    # Date text exactly as captured from the statement, in DD.MM.YY form.
    date: str
    # Text that followed the date, with any trailing amount removed.
    description: str
    # Amount as parsed; credit status is carried separately in is_credit.
    amount: float
    # True when the amount carried a "CR" marker.
    is_credit: bool
|
||||
|
||||
|
||||
def _should_skip(line: str) -> bool:
    """Return True when *line* starts with any of the compiled skip patterns."""
    return any(skip_re.match(line) for skip_re in SKIP_RES)
|
||||
|
||||
|
||||
def _parse_amount(text: str) -> tuple[Optional[float], bool]:
    """
    Parse a standalone amount string such as '1,242.55CR' or '21.45'.

    Returns ``(amount, is_credit)``.  Thousands separators are removed before
    conversion.  When the stripped text is not matched by ``AMOUNT_RE`` the
    result is ``(None, False)``.
    """
    matched = AMOUNT_RE.match(text.strip())
    if matched is None:
        return None, False

    number_text = matched.group(1).replace(',', '')
    credit_marker = matched.group(2)
    return float(number_text), bool(credit_marker)
|
||||
|
||||
|
||||
def _extract_lines(page) -> list[str]:
|
||||
"""
|
||||
Use word bounding boxes to reconstruct lines.
|
||||
Groups words by Y-position and sorts each group by X.
|
||||
Amounts (rightmost column) are appended at end of line with a tab separator.
|
||||
"""
|
||||
words = page.extract_words(x_tolerance=5, y_tolerance=4)
|
||||
if not words:
|
||||
return []
|
||||
|
||||
# Determine amount-column threshold (rightmost ~22% of page)
|
||||
page_width = page.width
|
||||
amount_threshold = page_width * 0.78
|
||||
|
||||
# Group words by rounded Y position
|
||||
buckets: dict[int, dict] = {}
|
||||
for w in words:
|
||||
key = round(w['top'] / 4) * 4
|
||||
if key not in buckets:
|
||||
buckets[key] = {'left': [], 'right': []}
|
||||
if w['x0'] >= amount_threshold:
|
||||
buckets[key]['right'].append((w['x0'], w['text']))
|
||||
else:
|
||||
buckets[key]['left'].append((w['x0'], w['text']))
|
||||
|
||||
lines = []
|
||||
for y in sorted(buckets):
|
||||
left = ' '.join(t for _, t in sorted(buckets[y]['left']))
|
||||
right = ' '.join(t for _, t in sorted(buckets[y]['right']))
|
||||
line = left
|
||||
if right:
|
||||
line = f"{left}\t{right}" if left else right
|
||||
lines.append(line.strip())
|
||||
|
||||
return [l for l in lines if l]
|
||||
|
||||
|
||||
def parse_statement(pdf_path: str) -> list[Transaction]:
    """
    Read every page of the PDF at *pdf_path* and return its transactions.

    A line beginning with a DD.MM.YY date opens a new entry; the entry stays
    open across following lines (and across pages) so that an amount or a
    lone "CR" marker printed on a later line can still be attached to it.
    An entry is emitted when the next dated line appears or at end of input,
    and is silently dropped if no amount was ever found for it.
    """
    results: list[Transaction] = []
    current: Optional[dict] = None

    def flush(entry) -> None:
        # Only entries that ended up with an amount become transactions.
        if not entry or entry.get('amount') is None:
            return
        results.append(Transaction(
            date=entry['date'],
            description=entry['description'].strip(),
            amount=entry['amount'],
            is_credit=entry['is_credit'],
        ))

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            for raw_line in _extract_lines(page):
                # Text before the first tab is the description side; anything
                # after it is the amount column.
                head, _, tail = raw_line.partition('\t')
                desc_part = head.strip()
                amount_part = tail.strip()

                # Boilerplate is skipped only when it has no amount column.
                if not amount_part and _should_skip(desc_part):
                    continue

                opening = DATE_RE.match(desc_part)
                if opening:
                    # A dated line closes the previous entry and opens a new one.
                    flush(current)
                    remainder = opening.group(2).strip()

                    # The amount may trail the description on the same side.
                    inline = AMOUNT_INLINE_RE.match(remainder)
                    if inline:
                        description = inline.group(1).strip()
                        amount = float(inline.group(2).replace(',', ''))
                        is_credit = bool(inline.group(3))
                    else:
                        description, amount, is_credit = remainder, None, False

                    # A parseable amount-column value overrides an inline one.
                    if amount_part:
                        column_amount, column_credit = _parse_amount(amount_part)
                        if column_amount is not None:
                            amount, is_credit = column_amount, column_credit

                    current = {
                        'date': opening.group(1),
                        'description': description,
                        'amount': amount,
                        'is_credit': is_credit,
                    }
                    continue

                if not current:
                    # Nothing open to attach this line to.
                    continue

                if amount_part:
                    # Continuation line with an amount column: use it only if
                    # the open entry has no amount yet.
                    if current['amount'] is None:
                        value, credit = _parse_amount(amount_part)
                        if value is not None:
                            current['amount'] = value
                            current['is_credit'] = credit
                    continue

                # Continuation line without a tab: either a bare amount or a
                # lone CR marker.
                value, credit = _parse_amount(desc_part)
                if value is not None and current['amount'] is None:
                    current['amount'] = value
                    current['is_credit'] = credit
                elif desc_part == 'CR':
                    current['is_credit'] = True

    # Emit whatever entry is still open at end of input.
    flush(current)
    return results
|
||||
Reference in New Issue
Block a user