# import pdfplumber
# import re
# import json
# import os
# import logging

# # ---------------- CONFIG ---------------- #


# request_payload = {
#     "JL No.": ["63", "124","190"],
#     "daag No": ["371", "261","519"]
# }

# # --------------------------------------- #



# # 🔕 Suppress pdfminer font warnings
# logging.getLogger("pdfminer").setLevel(logging.ERROR)

# PDF_FOLDER = "/var/www/html/land-ocr/input_pdfs"   # folder containing PDFs


# JL_SET = set(request_payload["JL No."])
# DAAG_SET = set(request_payload["daag No"])

# results = []

# def scan_pdf(pdf_path):
#     output = {
#         "pdf": os.path.basename(pdf_path),
#         "matches": []
#     }

#     with pdfplumber.open(pdf_path) as pdf:
#         for page_no, page in enumerate(pdf.pages, start=1):
#             words = page.extract_words(use_text_flow=True) or []
#             page_text = " ".join(w["text"] for w in words)

#             found_jl = []
#             found_daag = []

#             # JL No detection
#             for jl in JL_SET:
#                 if re.search(rf"\b{jl}\b", page_text):
#                     found_jl.append(jl)

#             # Daag No detection
#             for daag in DAAG_SET:
#                 if re.search(rf"\b{daag}\b", page_text):
#                     found_daag.append(daag)

#             if found_jl or found_daag:
#                 output["matches"].append({
#                     "page": page_no,
#                     "JL_No": found_jl,
#                     "Daag_No": found_daag
#                 })

#     if output["matches"]:
#         return output
#     return None


# for file in os.listdir(PDF_FOLDER):
#     if file.lower().endswith(".pdf"):
#         result = scan_pdf(os.path.join(PDF_FOLDER, file))
#         if result:
#             results.append(result)

# print(json.dumps(results, indent=2, ensure_ascii=False))

# import pdfplumber
# import pytesseract
# import re
# import os
# import json
# import logging
# from pdf2image import convert_from_path

# # ---------------- CONFIG ---------------- #

# PDF_FOLDER = "/var/www/html/land-ocr/input_pdfs"


# request_payload = {
#     "JL No.": ["63", "124", "999"],
#     "daag No": ["261", "371", "888"]
# }

# OCR_LANG = "ben+eng"
# TEXT_MIN_LEN = 30

# # -------------------------------------- #

# logging.getLogger("pdfminer").setLevel(logging.ERROR)

# REQ_JL = set(request_payload["JL No."])
# REQ_DAAG = set(request_payload["daag No"])

# FOUND_JL = set()
# FOUND_DAAG = set()

# def extract_text_pdf(page):
#     words = page.extract_words(use_text_flow=True) or []
#     return " ".join(w["text"] for w in words)


# def extract_text_ocr(pdf_path, page_no):
#     images = convert_from_path(
#         pdf_path,
#         first_page=page_no,
#         last_page=page_no,
#         dpi=300
#     )
#     return pytesseract.image_to_string(images[0], lang=OCR_LANG)


# def search(text):
#     for jl in REQ_JL:
#         if re.search(rf"\b{jl}\b", text):
#             FOUND_JL.add(jl)

#     for daag in REQ_DAAG:
#         if re.search(rf"\b{daag}\b", text):
#             FOUND_DAAG.add(daag)


# for file in os.listdir(PDF_FOLDER):
#     if not file.lower().endswith(".pdf"):
#         continue

#     pdf_path = os.path.join(PDF_FOLDER, file)

#     with pdfplumber.open(pdf_path) as pdf:
#         for page_no, page in enumerate(pdf.pages, start=1):

#             text = extract_text_pdf(page)

#             if not text or len(text) < TEXT_MIN_LEN:
#                 text = extract_text_ocr(pdf_path, page_no)

#             search(text)

# # ---------------- FINAL OUTPUT ---------------- #

# output = {
#     "found": {
#         "JL_No": sorted(FOUND_JL),
#         "Daag_No": sorted(FOUND_DAAG)
#     },
#     "not_found": {
#         "JL_No": sorted(REQ_JL - FOUND_JL),
#         "Daag_No": sorted(REQ_DAAG - FOUND_DAAG)
#     }
# }

# print(output)

# import pdfplumber
# import pytesseract
# import re
# import os
# import requests
# import tempfile
# import logging
# from pdf2image import convert_from_path

# # ---------------- INPUT ---------------- #

# pdf_urls = [
#     "https://example.com/261__BRAHMANBAHARA__MAYURESWAR-2.pdf",
#     "https://example.com/371(Plot No)_Suahanta Mondal.pdf"
# ]

# request_payload = {
#     "JL No.": ["63", "124", "999"],
#     "daag No": ["261", "371", "888"]
# }

# # ------------------------------------- #

# OCR_LANG = "ben+eng"
# TEXT_MIN_LEN = 30

# logging.getLogger("pdfminer").setLevel(logging.ERROR)

# REQ_JL = set(request_payload["JL No."])
# REQ_DAAG = set(request_payload["daag No"])

# FOUND_JL = set()
# FOUND_DAAG = set()


# def download_pdf(url, dest_folder):
#     local_path = os.path.join(dest_folder, os.path.basename(url))
#     r = requests.get(url, timeout=30)
#     r.raise_for_status()
#     with open(local_path, "wb") as f:
#         f.write(r.content)
#     return local_path


# def extract_text_pdf(page):
#     words = page.extract_words(use_text_flow=True) or []
#     return " ".join(w["text"] for w in words)


# def extract_text_ocr(pdf_path, page_no):
#     images = convert_from_path(
#         pdf_path,
#         first_page=page_no,
#         last_page=page_no,
#         dpi=300
#     )
#     return pytesseract.image_to_string(images[0], lang=OCR_LANG)


# def search(text):
#     for jl in REQ_JL:
#         if re.search(rf"\b{jl}\b", text):
#             FOUND_JL.add(jl)

#     for daag in REQ_DAAG:
#         if re.search(rf"\b{daag}\b", text):
#             FOUND_DAAG.add(daag)


# # ---------------- MAIN ---------------- #

# with tempfile.TemporaryDirectory() as tmpdir:

#     for url in pdf_urls:
#         try:
#             pdf_path = download_pdf(url, tmpdir)

#             with pdfplumber.open(pdf_path) as pdf:
#                 for page_no, page in enumerate(pdf.pages, start=1):

#                     text = extract_text_pdf(page)

#                     if not text or len(text) < TEXT_MIN_LEN:
#                         text = extract_text_ocr(pdf_path, page_no)

#                     search(text)

#         except Exception as e:
#             print(f"Skipped {url}: {e}")

# # ---------------- OUTPUT ---------------- #

# output = {
#     "found": {
#         "JL_No": sorted(FOUND_JL),
#         "Daag_No": sorted(FOUND_DAAG)
#     },
#     "not_found": {
#         "JL_No": sorted(REQ_JL - FOUND_JL),
#         "Daag_No": sorted(REQ_DAAG - FOUND_DAAG)
#     }
# }

# print(output)


import logging
import os
import re
import tempfile
from urllib.parse import unquote, urlparse

import pdfplumber
import pytesseract
import requests
from fastapi import FastAPI, HTTPException,Request
from fastapi.responses import JSONResponse
from pdf2image import convert_from_path
from PIL import Image
from pydantic import BaseModel
from pytesseract import Output



# Language string handed to pytesseract as `lang` when OCR-ing page images.
OCR_LANG = "ben+eng"
# Resolution (dots per inch) used when rasterising PDF pages with pdf2image.
DPI = 400

# FastAPI application object on which the route handlers in this file are registered.
app = FastAPI(title="Land PDF Search API")


class SearchRequest(BaseModel):
    # Request body of POST /extract.

    # URLs of the PDFs to download and scan.
    pdf_urls: list[str]
    # JL numbers to look for; compared as strings against numbers found in the PDFs.
    JL_No: list[str]
    # Daag numbers to look for; compared as strings against numbers found in the PDFs.
    Daag_No: list[str]


# Translation table that maps each Bengali digit to its ASCII counterpart.
BN_TO_EN = str.maketrans("০১২৩৪৫৬৭৮৯", "0123456789")

def normalize_text(text: str) -> str:
    """Return *text* with Bengali digits converted to ASCII digits and every
    run of whitespace (including newlines) collapsed to a single space."""
    ascii_digits = text.translate(BN_TO_EN)
    return re.sub(r"\s+", " ", ascii_digits)


def extract_jl_numbers(text: str) -> set[str]:
    """Collect every standalone run of two to four digits found in *text*.

    Longer digit runs and single digits are ignored because the pattern is
    anchored on word boundaries at both ends.
    """
    return {hit.group(0) for hit in re.finditer(r"\b\d{2,4}\b", text)}


# def extract_daag_numbers(text: str) -> set[str]:
#     text = re.sub(r"দা\s*গ", "দাগ", text)
#     matches = re.findall(r"দাগ(?:\s*নং)?\s*(\d{2,4})", text)
#     return set(matches)


def extract_daag_from_pdf_text(pdf_path: str) -> set[str]:
    """Read the embedded text layer of the PDF at *pdf_path* and return the
    numbers that start a Daag row.

    A Daag row is recognised, after digit conversion and whitespace
    flattening, as: a 2-4 digit number, some non-digit text, a decimal
    number, then the word "Click" (e.g. ``261 সৈ য়ম 2.17 Click Here``).
    Pages without extractable text contribute nothing.
    """
    row_pattern = r"\b(\d{2,4})\s+[^0-9]+?\s+\d+\.\d+\s+Click"
    daag_numbers = set()

    with pdfplumber.open(pdf_path) as document:
        for pdf_page in document.pages:
            # extract_text() may return None for image-only pages.
            page_text = (pdf_page.extract_text() or "").translate(BN_TO_EN)
            flattened = re.sub(r"\s+", " ", page_text)
            daag_numbers.update(re.findall(row_pattern, flattened))

    return daag_numbers

def extract_daag_numbers(text: str) -> set[str]:
    """
    Pull Daag numbers out of multi-line text.

    Every line containing the word "Click" is treated as a Daag row, such as
    '261 সৈ য়ম 2.17 Click Here', and the first standalone 2-4 digit number
    on that line is taken as its Daag number.
    """
    # Convert Bengali digits to ASCII, then close up the split spelling of "দাগ".
    cleaned = re.sub(r"দা\s*গ", "দাগ", text.translate(BN_TO_EN))

    found = set()
    for row in map(str.strip, cleaned.splitlines()):
        # Rows without the "Click" link text are not Daag rows.
        if "Click" not in row:
            continue
        first_number = re.search(r"\b\d{2,4}\b", row)
        if first_number is not None:
            found.add(first_number.group(0))

    return found



def download_pdf(url: str, folder: str) -> str:
    """Download the file at *url* into *folder* and return its local path.

    The local file name is derived from the URL's path component only, so a
    query string or fragment never ends up in the name. Percent-escapes are
    decoded and only the final path component is kept. If the URL yields no
    usable name (empty, "." or ".."), "download.pdf" is used instead.

    The response body is streamed to disk in chunks rather than being held
    in memory as a whole.

    Args:
        url: HTTP(S) address of the file to fetch.
        folder: Existing directory in which the file is stored.

    Returns:
        Path of the downloaded file inside *folder*.

    Raises:
        requests.RequestException: on connection problems, timeouts or a
            non-success HTTP status.
        OSError: if the destination file cannot be written.
    """
    name = os.path.basename(unquote(urlparse(url).path))
    if name in ("", ".", ".."):
        # e.g. "https://host/dir/" has no file component to reuse.
        name = "download.pdf"
    path = os.path.join(folder, name)

    # The context manager releases the connection even if writing fails.
    with requests.get(url, timeout=30, stream=True) as r:
        r.raise_for_status()
        with open(path, "wb") as f:
            for chunk in r.iter_content(chunk_size=65536):
                f.write(chunk)
    return path


logger = logging.getLogger(__name__)


@app.post("/extract")
def extract_land_data(payload: SearchRequest):
    """Search the given PDFs for the requested JL and Daag numbers.

    For each URL the PDF is downloaded into a temporary directory and then:

    * Daag numbers are first looked up in the PDF's embedded text layer
      (once per PDF).
    * Every page is rasterised and OCR'd; JL numbers are matched against the
      normalised OCR text. While some requested Daag numbers are still
      missing, the OCR text is also scanned for Daag rows.

    A PDF that cannot be downloaded or processed is logged and skipped so
    that the remaining PDFs are still examined.

    Args:
        payload: PDF URLs plus the JL and Daag numbers to look for.

    Returns:
        A dict with "found" and "not_found" sections, each holding sorted
        "JL_No" and "Daag_No" lists, when every requested number was found.

    Raises:
        HTTPException: status 422 with the same dict as ``detail`` when at
            least one requested number was not found.
    """
    req_jl = set(payload.JL_No)
    req_daag = set(payload.Daag_No)

    found_jl = set()
    found_daag = set()

    with tempfile.TemporaryDirectory() as tmpdir:
        for url in payload.pdf_urls:
            try:
                pdf_path = download_pdf(url, tmpdir)
                images = convert_from_path(pdf_path, dpi=DPI)

                # 1) Text layer: independent of the page being OCR'd, so it is
                #    read once per PDF instead of once per page image.
                try:
                    daag_from_text = extract_daag_from_pdf_text(pdf_path)
                except Exception:
                    # A broken/missing text layer must not prevent the OCR pass.
                    logger.warning(
                        "Text-layer extraction failed for %s; relying on OCR",
                        url,
                        exc_info=True,
                    )
                    daag_from_text = set()
                found_daag.update(req_daag & daag_from_text)

                for page_no, img in enumerate(images, start=1):
                    raw_text = pytesseract.image_to_string(
                        img,
                        lang=OCR_LANG,
                        config="--psm 6"
                    )

                    text = normalize_text(raw_text)
                    logger.debug("OCR text for %s page %d: %s", url, page_no, text)
                    found_jl.update(req_jl & extract_jl_numbers(text))

                    # 2) OCR fallback only if needed. extract_daag_numbers works
                    #    line by line, so it gets the raw OCR output; the
                    #    normalised text has all line breaks collapsed.
                    if req_daag - found_daag:
                        found_daag.update(req_daag & extract_daag_numbers(raw_text))

            except Exception:
                # Best effort: record why this PDF was skipped, then move on.
                logger.exception("Skipping %s: processing failed", url)
                continue

    response = {
        "found": {
            "JL_No": sorted(found_jl),
            "Daag_No": sorted(found_daag)
        },
        "not_found": {
            "JL_No": sorted(req_jl - found_jl),
            "Daag_No": sorted(req_daag - found_daag)
        }
    }

    if not response["not_found"]["JL_No"] and not response["not_found"]["Daag_No"]:
        return response

    raise HTTPException(status_code=422, detail=response)




@app.post("/quick_check")
async def quick_check(request: Request):
    """Return a fixed sample result with HTTP 200; the incoming request is
    accepted but not inspected."""
    found = {"JL_No": ["63", "124"], "Daag_No": ["261", "371"]}
    not_found = {"JL_No": ["999"], "Daag_No": ["888"]}
    body = {"found": found, "not_found": not_found}
    return JSONResponse(status_code=200, content=body)


