#!/usr/bin/env python3
"""
process_all_selectable_fallback.py

Batch-convert PDFs -> English text.
If PDF has selectable text, extract that (PyMuPDF). Otherwise fallback to image OCR (pytesseract).
Bengali->English translation is an optional, not-yet-wired step: this script as written does NOT translate; extracted text is saved as-is (see USE_TRANSLATION below).
This script focuses on extraction for *generated/selectable* PDFs (fast, reliable).

Usage:
  python process_all_selectable_fallback.py --input /opt/bengali_translator/input_pdfs --output /opt/bengali_translator/outputs
"""
import argparse
import logging
import sys
import time
from pathlib import Path
from tqdm import tqdm

import fitz  # PyMuPDF
from pdf2image import convert_from_path
from PIL import Image
import pytesseract
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter

# Optional translation pipeline: if you need translation, set this to True and configure model.
# NOTE(review): none of the functions visible in this file read USE_TRANSLATION,
# TRANSLATION_MODEL or TEXT_CHUNK, so changing them currently has no effect here.
USE_TRANSLATION = False
# Model identifier for the Bengali -> English translation model (presumably a
# Hugging Face model id — confirm before use).
TRANSLATION_MODEL = "Helsinki-NLP/opus-mt-bn-en"
# Presumably the maximum number of characters per translation chunk — TODO confirm.
TEXT_CHUNK = 3500

# Configure root logging once at import time: INFO level, timestamped messages.
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
# Module-wide logger used by every function below.
logger = logging.getLogger("selectable_extractor")

# ================= helpers =================

def extract_text_pymupdf(pdf_path):
    """Extract the embedded (selectable) text of a PDF with PyMuPDF.

    Args:
        pdf_path: Path of the PDF file, in any form accepted by ``fitz.open``.

    Returns:
        A ``(text, page_count)`` tuple. ``text`` is the per-page text joined
        with a blank line between pages; a page without a text layer
        contributes an empty string.

    Raises:
        Whatever ``fitz.open`` / ``page.get_text`` raise for unreadable or
        corrupt files; the caller decides how to fall back.
    """
    # Use the document as a context manager so the underlying file handle is
    # released even if text extraction fails part-way through (the handle
    # otherwise stays open for every PDF of a batch run).
    with fitz.open(pdf_path) as doc:
        # get_text("text") returns the plain page text ('' if none)
        texts = [page.get_text("text") for page in doc]
        # Read the page count while the document is still open.
        page_count = len(doc)
    return "\n\n".join(texts), page_count

def ocr_pdf_images(pdf_path, poppler_path=None, dpi=300, tesseract_lang="ben"):
    """OCR fallback: rasterize every page and run Tesseract on it.

    Args:
        pdf_path: Path of the PDF file to rasterize.
        poppler_path: Optional location of the Poppler binaries, forwarded to
            ``convert_from_path``.
        dpi: Rendering resolution used when converting pages to images.
        tesseract_lang: Tesseract language code used for recognition.

    Returns:
        A ``(text, page_count)`` tuple, where ``text`` is the recognized text
        of all pages joined with a blank line between pages.
    """
    page_images = convert_from_path(pdf_path, dpi=dpi, poppler_path=poppler_path)
    # Each page is converted to 8-bit grayscale ("L") before recognition.
    recognized = [
        pytesseract.image_to_string(image.convert("L"), lang=tesseract_lang)
        for image in page_images
    ]
    return "\n\n".join(recognized), len(page_images)

def save_text_to_pdf(text, out_pdf_path):
    """Create a simple text-only PDF (ReportLab) from ``text``.

    Layout rules:
      * Letter-sized pages with a 40pt margin and a 12pt line pitch.
      * ``text`` is split on blank lines; every resulting paragraph starts on
        a fresh page.
      * Lines are hard-wrapped every 90 characters (naive wrap, no word
        boundaries); empty lines are not drawn.

    Args:
        text: The text to render.
        out_pdf_path: Destination path of the PDF file.

    NOTE(review): no font is selected here, so ReportLab's default font is
    used; non-Latin (e.g. Bengali) characters will likely not render
    correctly — confirm and register a suitable TTF font if needed.
    """
    c = canvas.Canvas(out_pdf_path, pagesize=letter)
    _, height = letter
    left, top, bottom = 40, height - 40, 40
    line_height = 12
    wrap_at = 90  # characters per drawn line
    for paragraph in text.split("\n\n"):
        y = top
        for line in paragraph.split("\n"):
            for start in range(0, len(line), wrap_at):
                # Break the page *before* drawing rather than after: checking
                # after the draw would start a new page even when nothing is
                # left to draw, and the paragraph-ending showPage() below
                # would then emit a spurious blank page.
                if y < bottom:
                    c.showPage()
                    y = top
                c.drawString(left, y, line[start:start + wrap_at])
                y -= line_height
        # Finish the paragraph's (last) page.
        c.showPage()
    c.save()

# ================= main =================

def process_pdf(pdf_path: Path, out_dir: Path, poppler_path: str = None):
    """Extract the text of one PDF and write it out as .txt and as a text PDF.

    If the PDF is selectable (PyMuPDF yields at least 10 non-whitespace-trimmed
    characters), that text is used. Otherwise, or if PyMuPDF raises, the
    pages are OCR'd instead. OCR is attempted at most once per file.

    Args:
        pdf_path: The PDF to process.
        out_dir: Existing directory receiving ``<stem>.txt`` and
            ``<stem>_en.pdf``.
        poppler_path: Optional Poppler location forwarded to the OCR fallback.

    Returns:
        A metadata dict with keys ``pdf``, ``status`` (``"ok"``, ``"error"``
        or ``"write_error"``), ``pages``, ``words`` and ``errors``; on success
        it additionally carries ``method`` (``"selectable"`` or ``"ocr"``).
        Failures are reported through the dict, never raised.
    """
    stem = pdf_path.stem
    txt_out = out_dir / f"{stem}.txt"
    pdf_out = out_dir / f"{stem}_en.pdf"
    meta = {"pdf": str(pdf_path), "status": "ok", "pages": 0, "words": 0, "errors": ""}

    # Stage 1: try the fast path. Only PyMuPDF is inside this try, so an OCR
    # failure can no longer be mis-reported as a PyMuPDF failure (and trigger
    # a second, equally expensive OCR pass).
    method = "ocr"
    text, pages = "", 0
    try:
        text, pages = extract_text_pymupdf(str(pdf_path))
    except Exception as e:
        # if PyMuPDF fails, fallback to OCR
        logger.warning("PyMuPDF extraction failed for %s: %s — falling back to OCR", pdf_path.name, e)
    else:
        if text and len(text.strip()) >= 10:
            method = "selectable"
            logger.info("Extracted selectable text from %s (pages=%d)", pdf_path.name, pages)
        else:
            logger.info("No selectable text found in %s — falling back to OCR", pdf_path.name)

    # Stage 2: OCR fallback, run exactly once when the fast path did not
    # produce usable text.
    if method == "ocr":
        try:
            text, pages = ocr_pdf_images(str(pdf_path), poppler_path=poppler_path)
        except Exception as e2:
            meta["status"] = "error"
            meta["errors"] = f"extract_failed: {e2}"
            return meta

    # Optionally: translate if it's in Bengali and USE_TRANSLATION True.
    # No translation is performed here; the extracted text is written as-is.
    combined = text.strip()
    try:
        txt_out.write_text(combined, encoding="utf-8")
        save_text_to_pdf(combined, str(pdf_out))
    except Exception as e:
        meta["status"] = "write_error"
        meta["errors"] = str(e)
        return meta

    meta["pages"] = pages
    meta["words"] = len(combined.split())
    meta["method"] = method
    return meta

def main():
    """Command-line entry point: convert every ``*.pdf`` in the input directory.

    Parses ``--input``/``-i``, ``--output``/``-o`` and the optional
    ``--poppler`` argument, creates the output directory if needed, runs
    ``process_pdf`` on each PDF (sorted by path) and finally writes one row
    per file to ``summary.csv`` in the output directory. If the input
    directory contains no PDFs, an error is logged and nothing else is written.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", "-i", required=True)
    parser.add_argument("--output", "-o", required=True)
    parser.add_argument("--poppler", default=None)
    args = parser.parse_args()

    in_dir, out_dir = Path(args.input), Path(args.output)
    out_dir.mkdir(parents=True, exist_ok=True)

    pdfs = sorted(in_dir.glob("*.pdf"))
    if not pdfs:
        logger.error("No PDFs found in %s", in_dir)
        return

    results = [
        process_pdf(pdf, out_dir, poppler_path=args.poppler)
        for pdf in tqdm(pdfs, desc="Processing PDFs")
    ]

    # write summary.csv
    import csv
    fieldnames = ["pdf", "status", "pages", "words", "method", "errors"]
    with (out_dir / "summary.csv").open("w", newline="", encoding="utf-8") as cf:
        writer = csv.DictWriter(cf, fieldnames=fieldnames)
        writer.writeheader()
        # Failed files carry no "method" key, so missing columns become "".
        writer.writerows(
            {column: row.get(column, "") for column in fieldnames}
            for row in results
        )

    logger.info("Done. Outputs in %s", out_dir)

# Run the batch conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
