#!/usr/bin/env python3
"""
pdf_translit_dynamic.py

Hybrid dynamic transliteration for Bangla land records (mixed Hindu+Muslim names).

Usage:
    python pdf_translit_dynamic.py /path/to/file.pdf
    python pdf_translit_dynamic.py /path/to/file.pdf --out-dir ./out --cache translit_cache.json
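    # batch a directory (shell sketch; adjust the glob and paths to your layout):
    for f in ./pdfs/*.pdf; do python pdf_translit_dynamic.py "$f" --out-dir ./out --cache translit_cache.json; done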

Outputs (default current working directory):
    <pdf_stem>_parsed.json
    <pdf_stem>_parsed_en_values.json

Features:
- PyMuPDF -> pdfplumber -> OCR fallback
- Rule-based transliteration (full coverage)
- Seed list of common names (Hindu + Muslim) for conventional spellings
- Fuzzy matching (difflib) to pick the best common variant
- Persistent JSON cache to learn mappings across runs
- Safe, fast, suitable for batch processing of thousands of PDFs
"""

from pathlib import Path
import argparse
import json
import re
import sys
from difflib import get_close_matches
from typing import Dict, Any, Optional

# -------------------------
# CONFIG: seed canonical name list for Mixed (Hindu + Muslim)
# Expand this list as you see common local variants; the system uses fuzzy matching.
# -------------------------
COMMON_ENGLISH_NAMES = [
    # Muslim given names & family names (common spellings)
    "Sheikh","Shekh","Zakir","Hossain","Hussain","Haque","Haq","Abdur","Abdul","Rashid",
    "Nurul","Islam","Ansarul","Ansar","Nasibuddin","Nasib","Mohammad","Mohammed","Hanif","Sami",
    "Selim","Enamul","Enam","Hafiz","Sarker","Sarkar",
    # Hindu names / surnames
    "Mondal","Mandal","Mondol","Ghosh","Das","Dutta","Roy","Roychowdhury","Bose","Mukherjee",
    "Sengupta","Biswas","Pal","Dey","Chatterjee","Chowdhury","Majumdar","Chakraborty","Adhikari",
    # female honorifics
    "Bibi","Khatun","Begum",
    # frequently seen names and combos
    "Indrani","Samaun","Idrish","Idris","Samaun Mondal"
]

# Deduplicate and sort; casing is kept, since matched seed entries are returned verbatim
COMMON_SET = sorted({n for n in COMMON_ENGLISH_NAMES if n})
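# Fuzzy matching snaps rule-based output onto these conventional spellings, e.g.
# (illustrative): get_close_matches("Hosain", COMMON_SET, n=1, cutoff=0.78) -> ["Hossain"]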

# -------------------------
# Transliteration tables (rule-based fallback)
# Keep these conservative ASCII approximations.
# -------------------------
CONS = {
 'ক':'k','খ':'kh','গ':'g','ঘ':'gh','ঙ':'ng',
 'চ':'ch','ছ':'chh','জ':'j','ঝ':'jh','ঞ':'n',
 'ট':'t','ঠ':'th','ড':'d','ঢ':'dh','ণ':'n',
 'ত':'t','থ':'th','দ':'d','ধ':'dh','ন':'n',
 'প':'p','ফ':'ph','ব':'b','ভ':'bh','ম':'m',
 'য':'y','র':'r','ল':'l','শ':'sh','ষ':'sh','স':'s','হ':'h',
 'ড়':'r','ঢ়':'rh','য়':'y'
}
VOWELS = {'া':'a','ি':'i','ী':'i','ু':'u','ূ':'u','ৃ':'ri','ে':'e','ৈ':'oi','ো':'o','ৌ':'ou'}
INDEP_V = {'অ':'a','আ':'a','ই':'i','ঈ':'i','উ':'u','ঊ':'u','ঋ':'ri','এ':'e','ঐ':'oi','ও':'o','ঔ':'ou'}
SPECIAL = {'ং':'ng','ঃ':'h','ঁ':'n','্':''}
# Bengali digits appear in khatian and dag numbers; map them so they survive transliteration
DIGITS = {'০':'0','১':'1','২':'2','৩':'3','৪':'4','৫':'5','৬':'6','৭':'7','৮':'8','৯':'9'}
BENGALI_CHAR_RE = re.compile(r'[\u0980-\u09FF]')

# Common wordwise map (prefer conventional spellings)
COMMON_MAP = {
    "শেখ":"Sheikh", "শেখ্":"Sheikh", "শেখা":"Sheikh",
    "জাকির":"Zakir","হোসেন":"Hossain","হক":"Haq","হক্":"Haq",
    "মন্ডল":"Mondal","মণ্ডল":"Mondal","মন্ডল্":"Mondal",
    "আব্দুর":"Abdur","রশিদ":"Rashid","ইসলাম":"Islam","নুরুল":"Nurul",
    "সামাউন":"Samaun","সেলিম":"Selim","ইদ্রিশ":"Idrish","ইন্দ্রানী":"Indrani",
    "আনসারুল":"Ansarul","নসিবুদ্দিন":"Nasibuddin","এনামুল":"Enamul",
    "হানিফ":"Hanif","মোহাম্মদ":"Mohammad","মহঃ":"Moh."
}
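# The word map runs before the character rules (see transliterate_candidate), so a
# full name hits conventional spellings directly, e.g. (illustrative):
#   apply_common_map_wordwise("শেখ জাকির হোসেন") -> "Sheikh Zakir Hossain"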

# -------------------------
# PDF extraction helpers
# -------------------------
def extract_with_pymupdf(path: str):
    try:
        import fitz
    except Exception:
        return None
    try:
        doc = fitz.open(path)
        pages = [p.get_text("text") or "" for p in doc]
        doc.close()  # release the file handle promptly; matters when batch-processing thousands of PDFs
        return "\n".join(pages).strip() or None
    except Exception:
        return None

def extract_with_pdfplumber(path: str):
    try:
        import pdfplumber
    except Exception:
        return None
    try:
        pages = []
        with pdfplumber.open(path) as pdf:
            for p in pdf.pages:
                pages.append(p.extract_text() or "")
        return "\n".join(pages).strip() or None
    except Exception:
        return None

def ocr_extract(path: str, dpi=300, lang="ben+eng"):
    try:
        from pdf2image import convert_from_path
        import pytesseract
    except Exception:
        return None
    try:
        imgs = convert_from_path(path, dpi=dpi)
        texts = [pytesseract.image_to_string(img, lang=lang) for img in imgs]
        return "\n".join(texts).strip() or None
    except Exception:
        return None
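# OCR needs system binaries that pip cannot install: poppler (used by pdf2image)
# and tesseract with the Bengali pack. On Debian/Ubuntu that is roughly
# `apt install poppler-utils tesseract-ocr-ben`; package names vary by platform.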

def extract_text_auto(path: str):
    t = extract_with_pymupdf(path)
    if t:
        print("✓ Extracted using PyMuPDF")
        return t
    t = extract_with_pdfplumber(path)
    if t:
        print("✓ Extracted using pdfplumber")
        return t
    print("No searchable text; trying OCR (requires pdf2image/pytesseract & system tesseract/poppler)...")
    t = ocr_extract(path)
    if t:
        print("✓ Extracted using OCR")
        return t
    raise RuntimeError("Failed to extract text. Install PyMuPDF/pdfplumber or OCR dependencies.")

# -------------------------
# Cleaning & parsing helpers
# -------------------------
def fix_bengali_spacing(text: str) -> str:
    # join short Bengali fragments that were split by OCR
    def join_line(line):
        toks = line.split()
        buf, out = [], []
        for tok in toks:
            if re.fullmatch(r'[\u0980-\u09FF]{1,3}', tok):
                buf.append(tok)
            else:
                if buf:
                    out.append("".join(buf)); buf=[]
                out.append(tok)
        if buf: out.append("".join(buf))
        return " ".join(out)
    return "\n".join(join_line(l) for l in text.splitlines())

NOISE_PATTERNS = [
    r"Banglarbhumi", r"বাংলা\s*ভুমি", r"^জেলা[:\s]", r"^ব্লক[:\s]", r"^মৌজা[:\s]",
    r"^\(Live Data As On", r"^জে\.এল\s*নং", r"^থানা[:\s]", r"দাগ\s*নং", r"খতিয়ান",
    r"রায়তের", r"পিতা\/স্বামী", r"অংশপরিমাণ", r"দখলদার", r"মন্তব্য", r"শ্রেণী", r"জমির মোট", r"দাগের ম্যাপ"
]

def remove_noise_lines(text: str):
    text = re.sub(r"Click\s*Here", "", text, flags=re.IGNORECASE)
    text = re.sub(r"\bNil\b", "", text)
    text = re.sub(r"ব্?যা\s?ক্তি", "", text)
    text = re.sub(r"ব্যক্তি", "", text)
    out_lines = []
    for ln in text.splitlines():
        s = ln.strip()
        if not s: continue
        skip = False
        for p in NOISE_PATTERNS:
            if re.search(p, s, flags=re.IGNORECASE):
                skip = True; break
        if skip: continue
        s = re.sub(r"Click\s*Here", "", s, flags=re.IGNORECASE)
        s = s.strip()
        if s: out_lines.append(s)
    return out_lines

def split_blocks_from_lines(lines):
    blocks=[]
    cur=[]
    for ln in lines:
        if ln.strip()=="":
            if cur:
                blocks.append("\n".join(cur).strip()); cur=[]
        else:
            cur.append(ln)
    if cur: blocks.append("\n".join(cur).strip())
    # split combined blocks by khatian pattern if necessary
    kh_pat = re.compile(r'^\d{1,4}(?:/\d+)?$')
    final=[]
    for b in blocks:
        lns=[l.strip() for l in b.splitlines() if l.strip()]
        if any(kh_pat.match(l) for l in lns):
            cur=[]
            for l in lns:
                if kh_pat.match(l):
                    if cur: final.append("\n".join(cur).strip())
                    cur=[l]
                else:
                    cur.append(l)
            if cur: final.append("\n".join(cur).strip())
        else:
            final.append(b)
    return final
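# Note: remove_noise_lines drops blank lines, so everything usually arrives as one
# block and the khatian-number split above does the real work. Illustrative:
#   ["102", "<name>", "<father>", "<areas>", "103", ...] -> one block per khatian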

def parse_block_exact(block: str) -> Dict[str, Any]:
    lines=[ln for ln in block.splitlines() if ln.strip()]
    rec = {"khatian":None, "raiter_name":None, "father_or_husband":None,
           "area_ekor":None, "area_other":None, "nil_1":None, "nil_2":None,
           "remarks":None, "raw_block": block}
    if not lines: return rec
    kh_pat = re.compile(r'^\d{1,4}(?:/\d+)?$')
    idx=0
    if kh_pat.match(lines[0]):
        rec['khatian']=lines[0]; idx=1
    else:
        for i,l in enumerate(lines):
            if kh_pat.match(l):
                rec['khatian']=l; idx=i+1; break
    if idx < len(lines):
        rec['raiter_name']=lines[idx]; idx+=1
    if idx < len(lines):
        if re.search(r'\d', lines[idx]) and ('|' in lines[idx] or re.search(r'\d\.\d+', lines[idx])):
            area_line = lines[idx]; idx+=1
        else:
            rec['father_or_husband']=lines[idx]; idx+=1
            # consume the area line here too; otherwise it is re-read as part of the tail below
            if idx < len(lines):
                area_line = lines[idx]; idx+=1
            else:
                area_line = ""
    else:
        area_line = ""
    if idx < len(lines):
        tail = " ".join(lines[idx:])
        if area_line:
            area_line = area_line + " " + tail
        else:
            rec['remarks'] = tail
    if area_line:
        parts = [p.strip() for p in re.split(r'--', area_line, maxsplit=1)]
        main = parts[0]; remark = parts[1] if len(parts)>1 else None
        comps = [c.strip() for c in re.split(r'\||\s{2,}', main) if c.strip()]
        if len(comps)==1:
            comps = [c.strip() for c in re.split(r'\s*\|\s*|\s+', main) if c.strip()]
        if len(comps)>=1: rec['area_ekor']=comps[0]
        if len(comps)>=2: rec['area_other']=comps[1]
        if len(comps)>=3: rec['nil_1']=comps[2]
        if len(comps)>=4: rec['nil_2']=comps[3]
        if remark: rec['remarks']=remark
    # normalize empty strings to None
    for k,v in list(rec.items()):
        if isinstance(v,str) and v.strip()=="":
            rec[k]=None
    return rec
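# Illustrative block -> record, assuming the usual line order (khatian, raiyat,
# father/husband, areas -- remarks); real layouts can shuffle or omit fields:
#   "102\nজাকির হোসেন\nশেখ হানিফ\n0.1200 | 0.0400 -- বর্গা"
#   -> {"khatian": "102", "raiter_name": "জাকির হোসেন", "father_or_husband": "শেখ হানিফ",
#       "area_ekor": "0.1200", "area_other": "0.0400", "remarks": "বর্গা", ...}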

# -------------------------
# Transliteration + dynamic normalization
# -------------------------
def apply_common_map_wordwise(s: str) -> str:
    if not s or not BENGALI_CHAR_RE.search(s):
        return s
    keys = sorted(COMMON_MAP.keys(), key=len, reverse=True)
    out = s
    for k in keys:
        out = re.sub(r'(?<!\w)'+re.escape(k)+r'(?!\w)', COMMON_MAP[k], out)
    return out

def fallback_transliterate(s: str) -> str:
    out=[]; i=0
    while i < len(s):
        ch=s[i]
        if not BENGALI_CHAR_RE.match(ch):
            out.append(ch); i+=1; continue
        if ch in INDEP_V:
            out.append(INDEP_V[ch]); i+=1; continue
        if ch in CONS:
            base = CONS[ch]
            nxt = s[i+1] if i+1 < len(s) else ''
            if nxt == '্':
                out.append(base); i+=2; continue
            if nxt in VOWELS:
                out.append(base + VOWELS[nxt]); i+=2; continue
            out.append(base + 'a'); i+=1; continue
        if ch in VOWELS:
            out.append(VOWELS[ch]); i+=1; continue
        if ch in SPECIAL:
            out.append(SPECIAL[ch]); i+=1; continue
        if ch in DIGITS:
            out.append(DIGITS[ch]); i+=1; continue
        i+=1  # unknown Bengali character (e.g. nukta, rare conjunct marks): drop it
    result = "".join(out)
    result = re.sub(r'\s+', ' ', result).strip()
    result = " ".join([w.capitalize() if w.isalpha() else w for w in result.split()])
    return result
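# Illustrative: fallback_transliterate("ঘোষ") -> "Ghosha". The inherent vowel is kept
# even word-finally (no schwa deletion), which is exactly why the seed list + fuzzy
# pass below snaps "Ghosha" back to the conventional "Ghosh".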

def transliterate_candidate(s: str) -> Optional[str]:
    if s is None: return None
    s = s.strip()
    if s == "": return s
    # First apply common map for direct replacements
    step1 = apply_common_map_wordwise(s)
    # If no bengali chars remain, return title-cased ASCII
    if not BENGALI_CHAR_RE.search(step1):
        return " ".join([w.capitalize() for w in re.split(r'\s+', step1) if w])
    # Otherwise fallback transliterate remaining
    return fallback_transliterate(step1)

def normalize_using_seed_and_cache(candidate: str, cache: Dict[str,str], seed_list = None, use_fuzzy=True) -> Optional[str]:
    """
    - candidate: ASCII transliteration candidate (from transliterate_candidate)
    - cache: dict stored persistently mapping original raw -> chosen canonical
    - seed_list: list of canonical English names to prefer (COMMON_SET)
    - use_fuzzy: if True, try to match to seed_list via difflib
    """
    if candidate is None: return None
    # numeric fields (khatian, areas like "0.1200") must not be punctuation-stripped
    # or fuzzy-matched against names; pass them through untouched
    if re.search(r'\d', candidate):
        return candidate
    cand_norm = re.sub(r'[^\w\s]', ' ', str(candidate)).strip()
    if cand_norm == "": return candidate
    # If exact cached mapping exists (key is cand_norm lower)
    key = cand_norm.lower()
    if key in cache:
        return cache[key]
    # If candidate already resembles a seed entry closely, use it
    if seed_list:
        # try exact case-insensitive match first
        for s in seed_list:
            if cand_norm.lower() == s.lower():
                cache[key] = s
                return s
        # fuzzy match
        if use_fuzzy:
            matches = get_close_matches(cand_norm, seed_list, n=3, cutoff=0.78)  # tuned cutoff
            if matches:
                chosen = matches[0]
                cache[key] = chosen
                return chosen
    # Otherwise keep candidate but normalize (title case)
    final = " ".join([w.capitalize() for w in cand_norm.split()])
    cache[key] = final
    return final
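# Illustrative: normalize_using_seed_and_cache("Ghosha", cache, COMMON_SET) -> "Ghosh"
# (difflib ratio ~0.91 clears the 0.78 cutoff); the result is cached under "ghosha",
# so subsequent runs and repeat names skip the fuzzy search entirely.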

# -------------------------
# Batch helpers: cache persistency
# -------------------------
def load_cache(path: Path) -> Dict[str,str]:
    if path.exists():
        try:
            data = json.loads(path.read_text(encoding='utf-8'))
            # ensure keys lowercased
            return {k.lower():v for k,v in data.items()}
        except Exception:
            return {}
    return {}

def save_cache(path: Path, cache: Dict[str,str]):
    path.write_text(json.dumps(cache, ensure_ascii=False, indent=2), encoding='utf-8')
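# The cache file is a flat JSON object of lowercased candidates -> chosen spellings,
# e.g. (illustrative): {"ghosha": "Ghosh", "zakir": "Zakir"}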

# -------------------------
# Pipeline: parse + transliterate values using cache & seed
# -------------------------
def pipeline_single_pdf(pdf_path: Path, out_dir: Path, cache_path: Path, seed_list):
    raw_text = extract_text_auto(str(pdf_path))
    fixed = fix_bengali_spacing(raw_text)
    cleaned_lines = remove_noise_lines(fixed)
    blocks = split_blocks_from_lines(cleaned_lines)
    records = [parse_block_exact(b) for b in blocks]

    parsed = {"source_pdf": pdf_path.name, "records": records, "metadata": extract_metadata(raw_text)}
    out_parsed = out_dir / f"{pdf_path.stem}_parsed.json"
    out_parsed.write_text(json.dumps(parsed, ensure_ascii=False, indent=2), encoding='utf-8')
    print("Saved parsed JSON:", out_parsed)

    # load cache
    cache = load_cache(cache_path)
    # transliterate & normalize all string values in parsed
    def transliterate_node(node):
        if isinstance(node, dict):
            return {k: transliterate_node(v) for k,v in node.items()}
        if isinstance(node, list):
            return [transliterate_node(v) for v in node]
        if isinstance(node, str):
            # Try to transliterate the string (rule-based)
            cand = transliterate_candidate(node)
            # Then normalize using cache + seed fuzzy matching
            norm = normalize_using_seed_and_cache(cand, cache, seed_list)
            return norm
        return node

    # note: we'll transliterate values but keep keys same
    translated = {"source_pdf": parsed["source_pdf"], "metadata": {}, "records": []}
    # transliterate metadata values only (no name normalization: places and dates are not names)
    if isinstance(parsed.get("metadata"), dict):
        for k,v in parsed["metadata"].items():
            translated["metadata"][k] = transliterate_candidate(v) if isinstance(v,str) else v
    # transliterate records via the recursive helper (names get the cache + fuzzy treatment)
    for rec in parsed["records"]:
        translated["records"].append(transliterate_node(rec))

    out_en = out_dir / f"{pdf_path.stem}_parsed_en_values.json"
    out_en.write_text(json.dumps(translated, ensure_ascii=False, indent=2), encoding='utf-8')
    print("Saved transliterated JSON:", out_en)

    # persist cache after processing
    save_cache(cache_path, cache)
    print("Updated transliteration cache:", cache_path)
    return out_parsed, out_en

# -------------------------
# Minimal metadata extraction (keeps original raw for trace)
# -------------------------
def extract_metadata(raw_text: str):
    meta = {}
    m = re.search(r"^জেলা[:\s]*(.+)$", raw_text, flags=re.MULTILINE)
    if m: meta['district'] = m.group(1).strip()
    m = re.search(r"^ব্লক[:\s]*(.+)$", raw_text, flags=re.MULTILINE)
    if m: meta['block'] = m.group(1).strip()
    m = re.search(r"^মৌজা[:\s]*(.+)$", raw_text, flags=re.MULTILINE)
    if m: meta['mouza'] = m.group(1).strip()
    m = re.search(r"\(Live Data As On\s*(.+?)\)", raw_text)
    if m: meta['live_data'] = m.group(1).strip()
    m = re.search(r"^জে\.এল\s*নং[:\s]*(.+)$", raw_text, flags=re.MULTILINE)
    if m: meta['jl_no'] = m.group(1).strip()
    m = re.search(r"^থানা[:\s]*(.+)$", raw_text, flags=re.MULTILINE)
    if m: meta['thana'] = m.group(1).strip()
    return meta
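# Illustrative result (only keys whose Bengali labels were found are present):
#   {"district": "...", "block": "...", "mouza": "...", "jl_no": "...", "thana": "...",
#    "live_data": "..."}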

# -------------------------
# CLI
# -------------------------
def main():
    ap = argparse.ArgumentParser()
    ap.add_argument("pdf", help="Path to PDF")
    ap.add_argument("--out-dir", help="Output directory (default: current directory)", default=".")
    ap.add_argument("--cache", help="Path to transliteration cache JSON (default: translit_cache.json)", default="translit_cache.json")
    ap.add_argument("--no-fuzzy", help="Disable fuzzy matching to seed list", action="store_true")
    args = ap.parse_args()

    pdf_path = Path(args.pdf)
    if not pdf_path.exists():
        print("Error: PDF not found:", pdf_path, file=sys.stderr); sys.exit(2)
    out_dir = Path(args.out_dir); out_dir.mkdir(parents=True, exist_ok=True)
    cache_path = Path(args.cache)
    seed_list = COMMON_SET

    # if user disabled fuzzy matching, pass None for seed_list in normalize
    if args.no_fuzzy:
        seed_list = None

    try:
        pipeline_single_pdf(pdf_path, out_dir, cache_path, seed_list)
    except Exception as e:
        print("Error during processing:", e, file=sys.stderr)
        raise

if __name__ == "__main__":
    main()
