import os
import json
from dotenv import load_dotenv
from pdf2image import convert_from_path
from PIL import Image
import cv2
import numpy as np
from openai import OpenAI
from paddleocr import PaddleOCR  # Ensure PaddleOCR is installed
import pytesseract

# Load environment variables (e.g., OPENAI_API_KEY) from a local .env file.
load_dotenv()

# Paths and settings. Both the input PDF and the output JSON are
# overridable via environment variables so the script can be pointed at
# other documents without editing the source (the defaults preserve the
# original hard-coded behavior).
PDF_PATH = os.getenv(
    "PDF_PATH",
    "/var/www/html/land-ocr/input_pdfs/261__BRAHMANBAHARA__MAYURESWAR-2.pdf",
)
OUTPUT_JSON = os.getenv("OUTPUT_JSON", "banglar_bhumi_structured.json")
TEMP_IMG_DIR = "temp_images"
os.makedirs(TEMP_IMG_DIR, exist_ok=True)

# OpenAI client. Fail fast with an actionable message when the key is
# missing instead of surfacing a less specific client-construction error.
_api_key = os.getenv("OPENAI_API_KEY")
if not _api_key:
    raise RuntimeError("OPENAI_API_KEY is not set (check your .env file).")
client = OpenAI(api_key=_api_key)

# Image preprocessing function
def preprocess_image(img_path):
    """Load the image at *img_path* and binarize it for OCR.

    The image is read as grayscale, median-filtered to remove
    salt-and-pepper scan noise, then adaptively thresholded so text
    remains legible under uneven page illumination.

    Args:
        img_path: Path to the page image on disk.

    Returns:
        The thresholded single-channel image (numpy array).

    Raises:
        FileNotFoundError: If the image cannot be read. cv2.imread
            returns None instead of raising, which would otherwise
            surface as a confusing cv2 error inside medianBlur.
    """
    img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
    if img is None:
        raise FileNotFoundError(f"Could not read image: {img_path}")
    img = cv2.medianBlur(img, 3)  # 3x3 median filter: remove scan noise
    # Gaussian-weighted adaptive threshold (11px window, offset 2) copes
    # with uneven lighting better than a single global threshold.
    img = cv2.adaptiveThreshold(
        img, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
    )
    # Optional: deskew if needed (using OpenCV or imutils)
    return img

# Initialize PaddleOCR with locally-downloaded Bengali models; if the
# engine (or its models) is unavailable we fall back to Tesseract later.
ocr_engine = None
use_paddle = False
try:
    ocr_engine = PaddleOCR(
        det_model_dir="models/ben/det",
        rec_model_dir="models/ben/rec",
        rec_char_dict_path="models/ben/ben_dict.txt",
        use_angle_cls=True,
        lang='bn'  # Bengali
    )
except Exception as e:
    print(f"⚠️ PaddleOCR unavailable ({e}), using Tesseract fallback.")
else:
    use_paddle = True
    print("✅ PaddleOCR initialized.")

# Convert PDF to images and process
pages = convert_from_path(PDF_PATH, dpi=400)  # Higher DPI for better accuracy
results = []


def _extract_json_payload(text):
    """Return *text* with any surrounding Markdown code fence removed.

    Models frequently wrap their answer in ```json ... ``` despite being
    told to emit bare JSON; json.loads() fails on the fenced form.
    """
    text = text.strip()
    if text.startswith("```"):
        # Drop the opening fence line (``` or ```json) ...
        text = text.split("\n", 1)[1] if "\n" in text else ""
        # ... and the closing fence, if present.
        stripped = text.rstrip()
        if stripped.endswith("```"):
            text = stripped[:-3]
    return text.strip()


for i, page in enumerate(pages):
    # Rasterize this page; keep both the raw and preprocessed copies on disk.
    img_path = os.path.join(TEMP_IMG_DIR, f"page_{i}.png")
    page.save(img_path, "PNG")

    processed_img = preprocess_image(img_path)
    processed_img_path = os.path.join(TEMP_IMG_DIR, f"processed_{i}.png")
    cv2.imwrite(processed_img_path, processed_img)

    # OCR extraction: PaddleOCR preferred (custom Bengali models),
    # Tesseract as the fallback engine.
    raw_text = ""
    if use_paddle:
        try:
            ocr_result = ocr_engine.ocr(processed_img_path, cls=True)
            if ocr_result and ocr_result[0]:
                # Each entry is [box, (text, confidence)]; keep only the text.
                raw_text = "\n".join([line[1][0] for line in ocr_result[0]])
        except Exception as e:
            print(f"⚠️ PaddleOCR failed on page {i+1} ({e}), switching to Tesseract.")
            raw_text = pytesseract.image_to_string(Image.open(processed_img_path), lang="ben")
    else:
        # --psm 6: assume a single uniform block of text.
        raw_text = pytesseract.image_to_string(Image.open(processed_img_path), lang="ben", config='--psm 6')

    # Send raw OCR text to the model for structured extraction and translation.
    structured_data = {}
    try:
        prompt = f"""
        You are an expert extractor for Banglarbhumi land records in Bengali.
        Extract structured data from this raw OCR text.
        Transliterate Bengali names to English romanization.
        Output ONLY valid JSON — no extra text.

        Extract:
        - plot_no (দাগ নং)
        - land_class (শ্রেণী) — translate to English (e.g., সৈয়ম = Pond)
        - total_area_acres

        Then the full owners table as a list of dicts with:
        - khatian_no
        - owner_name (English transliteration)
        - father_husband (English transliteration)
        - share
        - share_amount_acres
        - possessor (Nil or value)
        - remarks (Nil-- Remarks if present, else Nil)

        JSON structure:
        {{
          "plot_no": "",
          "land_class": "",
          "total_area_acres": "",
          "owners": [ {{ "khatian_no": "", "owner_name": "", "father_husband": "", "share": "", "share_amount_acres": "", "possessor": "", "remarks": "" }} ]
        }}

        Raw Text:
        {raw_text}
        """

        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are a precise data extractor. Output only JSON."},
                {"role": "user", "content": prompt}
            ],
            temperature=0,  # deterministic extraction
            max_tokens=2048,
            # JSON mode: constrain the model to a single valid JSON object
            # so json.loads below doesn't choke on prose or code fences.
            response_format={"type": "json_object"},
        )
        structured_json = _extract_json_payload(response.choices[0].message.content)
        structured_data = json.loads(structured_json)
    except Exception as e:
        # Best-effort: keep the raw OCR text when structuring fails so the
        # page is still represented in the output.
        print(f"⚠️ GPT extraction failed ({e}), keeping raw text.")
        structured_data = {"raw_text": raw_text}

    results.append({
        "page": i + 1,
        "raw_text": raw_text,
        "structured_data": structured_data
    })

# Persist the per-page results (raw OCR text + structured data) to disk.
with open(OUTPUT_JSON, "w", encoding="utf-8") as out_file:
    out_file.write(json.dumps(results, ensure_ascii=False, indent=2))

print(f"✅ OCR + Structured Extraction complete. Output saved to {OUTPUT_JSON}")