# import pdfplumber
# import re
# import json
# import os
# import logging

# # ---------------- CONFIG ---------------- #


# request_payload = {
#     "JL No.": ["63", "124","190"],
#     "daag No": ["371", "261","519"]
# }

# # --------------------------------------- #



# # 🔕 Suppress pdfminer font warnings
# logging.getLogger("pdfminer").setLevel(logging.ERROR)

# PDF_FOLDER = "/var/www/html/land-ocr/input_pdfs"   # folder containing PDFs


# JL_SET = set(request_payload["JL No."])
# DAAG_SET = set(request_payload["daag No"])

# results = []

# def scan_pdf(pdf_path):
#     output = {
#         "pdf": os.path.basename(pdf_path),
#         "matches": []
#     }

#     with pdfplumber.open(pdf_path) as pdf:
#         for page_no, page in enumerate(pdf.pages, start=1):
#             words = page.extract_words(use_text_flow=True) or []
#             page_text = " ".join(w["text"] for w in words)

#             found_jl = []
#             found_daag = []

#             # JL No detection
#             for jl in JL_SET:
#                 if re.search(rf"\b{jl}\b", page_text):
#                     found_jl.append(jl)

#             # Daag No detection
#             for daag in DAAG_SET:
#                 if re.search(rf"\b{daag}\b", page_text):
#                     found_daag.append(daag)

#             if found_jl or found_daag:
#                 output["matches"].append({
#                     "page": page_no,
#                     "JL_No": found_jl,
#                     "Daag_No": found_daag
#                 })

#     if output["matches"]:
#         return output
#     return None


# for file in os.listdir(PDF_FOLDER):
#     if file.lower().endswith(".pdf"):
#         result = scan_pdf(os.path.join(PDF_FOLDER, file))
#         if result:
#             results.append(result)

# print(json.dumps(results, indent=2, ensure_ascii=False))

# import pdfplumber
# import pytesseract
# import re
# import os
# import json
# import logging
# from pdf2image import convert_from_path

# # ---------------- CONFIG ---------------- #

# PDF_FOLDER = "/var/www/html/land-ocr/input_pdfs"


# request_payload = {
#     "JL No.": ["63", "124", "999"],
#     "daag No": ["261", "371", "888"]
# }

# OCR_LANG = "ben+eng"
# TEXT_MIN_LEN = 30

# # -------------------------------------- #

# logging.getLogger("pdfminer").setLevel(logging.ERROR)

# REQ_JL = set(request_payload["JL No."])
# REQ_DAAG = set(request_payload["daag No"])

# FOUND_JL = set()
# FOUND_DAAG = set()

# def extract_text_pdf(page):
#     words = page.extract_words(use_text_flow=True) or []
#     return " ".join(w["text"] for w in words)


# def extract_text_ocr(pdf_path, page_no):
#     images = convert_from_path(
#         pdf_path,
#         first_page=page_no,
#         last_page=page_no,
#         dpi=300
#     )
#     return pytesseract.image_to_string(images[0], lang=OCR_LANG)


# def search(text):
#     for jl in REQ_JL:
#         if re.search(rf"\b{jl}\b", text):
#             FOUND_JL.add(jl)

#     for daag in REQ_DAAG:
#         if re.search(rf"\b{daag}\b", text):
#             FOUND_DAAG.add(daag)


# for file in os.listdir(PDF_FOLDER):
#     if not file.lower().endswith(".pdf"):
#         continue

#     pdf_path = os.path.join(PDF_FOLDER, file)

#     with pdfplumber.open(pdf_path) as pdf:
#         for page_no, page in enumerate(pdf.pages, start=1):

#             text = extract_text_pdf(page)

#             if not text or len(text) < TEXT_MIN_LEN:
#                 text = extract_text_ocr(pdf_path, page_no)

#             search(text)

# # ---------------- FINAL OUTPUT ---------------- #

# output = {
#     "found": {
#         "JL_No": sorted(FOUND_JL),
#         "Daag_No": sorted(FOUND_DAAG)
#     },
#     "not_found": {
#         "JL_No": sorted(REQ_JL - FOUND_JL),
#         "Daag_No": sorted(REQ_DAAG - FOUND_DAAG)
#     }
# }

# print(output)

# import pdfplumber
# import pytesseract
# import re
# import os
# import requests
# import tempfile
# import logging
# from pdf2image import convert_from_path

# # ---------------- INPUT ---------------- #

# pdf_urls = [
#     "https://example.com/261__BRAHMANBAHARA__MAYURESWAR-2.pdf",
#     "https://example.com/371(Plot No)_Suahanta Mondal.pdf"
# ]

# request_payload = {
#     "JL No.": ["63", "124", "999"],
#     "daag No": ["261", "371", "888"]
# }

# # ------------------------------------- #

# OCR_LANG = "ben+eng"
# TEXT_MIN_LEN = 30

# logging.getLogger("pdfminer").setLevel(logging.ERROR)

# REQ_JL = set(request_payload["JL No."])
# REQ_DAAG = set(request_payload["daag No"])

# FOUND_JL = set()
# FOUND_DAAG = set()


# def download_pdf(url, dest_folder):
#     local_path = os.path.join(dest_folder, os.path.basename(url))
#     r = requests.get(url, timeout=30)
#     r.raise_for_status()
#     with open(local_path, "wb") as f:
#         f.write(r.content)
#     return local_path


# def extract_text_pdf(page):
#     words = page.extract_words(use_text_flow=True) or []
#     return " ".join(w["text"] for w in words)


# def extract_text_ocr(pdf_path, page_no):
#     images = convert_from_path(
#         pdf_path,
#         first_page=page_no,
#         last_page=page_no,
#         dpi=300
#     )
#     return pytesseract.image_to_string(images[0], lang=OCR_LANG)


# def search(text):
#     for jl in REQ_JL:
#         if re.search(rf"\b{jl}\b", text):
#             FOUND_JL.add(jl)

#     for daag in REQ_DAAG:
#         if re.search(rf"\b{daag}\b", text):
#             FOUND_DAAG.add(daag)


# # ---------------- MAIN ---------------- #

# with tempfile.TemporaryDirectory() as tmpdir:

#     for url in pdf_urls:
#         try:
#             pdf_path = download_pdf(url, tmpdir)

#             with pdfplumber.open(pdf_path) as pdf:
#                 for page_no, page in enumerate(pdf.pages, start=1):

#                     text = extract_text_pdf(page)

#                     if not text or len(text) < TEXT_MIN_LEN:
#                         text = extract_text_ocr(pdf_path, page_no)

#                     search(text)

#         except Exception as e:
#             print(f"Skipped {url}: {e}")

# # ---------------- OUTPUT ---------------- #

# output = {
#     "found": {
#         "JL_No": sorted(FOUND_JL),
#         "Daag_No": sorted(FOUND_DAAG)
#     },
#     "not_found": {
#         "JL_No": sorted(REQ_JL - FOUND_JL),
#         "Daag_No": sorted(REQ_DAAG - FOUND_DAAG)
#     }
# }

# print(output)

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import pdfplumber
import pytesseract
import re
import os
import requests
import tempfile
import logging
from pdf2image import convert_from_path
from concurrent.futures import ThreadPoolExecutor, as_completed

OCR_LANG = "ben+eng"      # Tesseract languages: Bengali + English
TEXT_MIN_LEN = 30         # below this many extracted characters, fall back to OCR
DOWNLOAD_TIMEOUT = 20     # seconds allowed per PDF download
OCR_WORKERS = 6           # worker threads scanning pages of a PDF

logging.getLogger("pdfminer").setLevel(logging.ERROR)

app = FastAPI(title="Land PDF Search API")

class SearchRequest(BaseModel):
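    """Request body: PDF URLs to scan plus the JL and Daag (plot) numbers to search for."""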
    pdf_urls: list[str]
    JL_No: list[str]
    Daag_No: list[str]


def download_pdf(url, folder):
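    """Download a PDF into `folder`, using the URL's basename as the filename."""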
    path = os.path.join(folder, os.path.basename(url))
    r = requests.get(url, timeout=DOWNLOAD_TIMEOUT)
    r.raise_for_status()
    with open(path, "wb") as f:
        f.write(r.content)
    return path


def extract_text_pdf(page):
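    """Join the words pdfplumber extracts from a page into one searchable string."""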
    words = page.extract_words(use_text_flow=True) or []
    return " ".join(w["text"] for w in words)


def extract_text_ocr(pdf_path, page_no):
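    """Render a single page at 300 DPI and OCR it with Tesseract (Bengali + English)."""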
    images = convert_from_path(
        pdf_path,
        first_page=page_no,
        last_page=page_no,
        dpi=300
    )
    return pytesseract.image_to_string(images[0], lang=OCR_LANG)


# def scan_page(pdf_path, page_no, page, jl_set, daag_set):
#     text = extract_text_pdf(page)

#     if not text or len(text) < TEXT_MIN_LEN:
#         text = extract_text_ocr(pdf_path, page_no)

#     found_jl = {jl for jl in jl_set if re.search(rf"\b{jl}\b", text)}
#     found_daag = {d for d in daag_set if re.search(rf"\b{d}\b", text)}

#     return found_jl, found_daag


def scan_page(pdf_path, page_no, page, jl_set, daag_set):
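    """Return the requested JL/Daag numbers found on one page, falling back to OCR
    when the embedded text layer is missing or shorter than TEXT_MIN_LEN."""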
    text = extract_text_pdf(page)

    if not text or len(text) < TEXT_MIN_LEN:
        text = extract_text_ocr(pdf_path, page_no)

    # Normalize digits (Bengali → ASCII, collapse spacing) before matching
    text = normalize_text(text)

    found_jl = {jl for jl in jl_set if re.search(rf"\b{re.escape(jl)}\b", text)}
    found_daag = {d for d in daag_set if re.search(rf"\b{re.escape(d)}\b", text)}

    return found_jl, found_daag



BN_TO_EN = str.maketrans("০১২৩৪৫৬৭৮৯", "0123456789")

def normalize_text(text: str) -> str:
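    """Map Bengali digits to ASCII and join digits split by whitespace,
    e.g. "২ ৬ ১" -> "261". Side effect: adjacent separate numbers are also fused
    ("63 124" -> "63124"), the price of tolerating OCR spacing noise."""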
    # 1️⃣ Bengali → English digits
    text = text.translate(BN_TO_EN)

    # 2️⃣ Collapse spaced digits: "2 6 1" → "261"
    text = re.sub(r'(?<=\d)\s+(?=\d)', '', text)

    # 3️⃣ Normalize whitespace
    text = re.sub(r'\s+', ' ', text)

    return text



@app.post("/extract")
def search_land_records(payload: SearchRequest):
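    """Download each PDF, scan every page for the requested JL/Daag numbers, and
    return found/not_found sets. Responds 200 only when every requested number
    was found; any miss raises 422 with the same breakdown as the error detail."""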

    req_jl = set(payload.JL_No)
    req_daag = set(payload.Daag_No)

    found_jl = set()
    found_daag = set()

    with tempfile.TemporaryDirectory() as tmpdir:
        for url in payload.pdf_urls:
            try:
                pdf_path = download_pdf(url, tmpdir)

                with pdfplumber.open(pdf_path) as pdf:
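                    # NOTE: pages are handed to worker threads mainly so OCR can run in
                    # parallel; pdfplumber/pdfminer is not documented as thread-safe, so
                    # concurrent extract_words() on a shared PDF is an assumption here.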
                    with ThreadPoolExecutor(max_workers=OCR_WORKERS) as executor:
                        futures = [
                            executor.submit(
                                scan_page,
                                pdf_path,
                                page_no + 1,
                                page,
                                req_jl,
                                req_daag
                            )
                            for page_no, page in enumerate(pdf.pages)
                        ]

                        for f in as_completed(futures):
                            jl, daag = f.result()
                            found_jl.update(jl)
                            found_daag.update(daag)

            except Exception as exc:
                # partial success allowed; log the failed URL and keep going
                logging.warning("Skipped %s: %s", url, exc)
                continue

    response_body = {
        "found": {
            "JL_No": sorted(found_jl),
            "Daag_No": sorted(found_daag)
        },
        "not_found": {
            "JL_No": sorted(req_jl - found_jl),
            "Daag_No": sorted(req_daag - found_daag)
        }
    }

    # ✅ Every requested number found → 200 with the summary body
    if not response_body["not_found"]["JL_No"] and not response_body["not_found"]["Daag_No"]:
        return response_body

    # ❌ Any requested number missing → 422 with the same summary as detail
    raise HTTPException(status_code=422, detail=response_body)
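

# Example usage (a sketch; "main.py" is an assumed filename and the URL below is a placeholder):
#
#   uvicorn main:app --host 0.0.0.0 --port 8000
#
#   curl -X POST http://localhost:8000/extract \
#        -H "Content-Type: application/json" \
#        -d '{"pdf_urls": ["https://example.com/sample.pdf"],
#             "JL_No": ["63", "124"],
#             "Daag_No": ["261", "371"]}'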
