import os
import json
import cv2
from dotenv import load_dotenv
from pdf2image import convert_from_path
from utils import preprocess_image

from paddleocr import PaddleOCR
import pytesseract
from PIL import Image
import openai

# Load environment (python-dotenv) before any os.getenv() lookups below.
load_dotenv()

# Input PDF. Overridable through the PDF_PATH environment variable, in the same
# way OUTPUT_JSON is; the default is the path this script was written for.
PDF_PATH = os.getenv(
    "PDF_PATH",
    "/var/www/html/land-ocr/input_pdfs/261__BRAHMANBAHARA__MAYURESWAR-2.pdf",
)
# Destination file for the per-page OCR/translation records.
OUTPUT_JSON = os.getenv("OUTPUT_JSON", "banglar_bhumi_raw.json")
# Scratch directory for the rendered and preprocessed page images.
TEMP_IMG_DIR = "temp_images"
os.makedirs(TEMP_IMG_DIR, exist_ok=True)

# os.getenv returns None when the key is unset; the translation step catches
# any resulting API error and keeps the raw text.
openai.api_key = os.getenv("OPENAI_API_KEY")

# Build the PaddleOCR engine from the local model files under models/ben.
# `use_paddle` records whether construction succeeded; when it did not,
# `ocr_engine` stays None and the script relies on Tesseract instead.
_paddle_kwargs = {
    "det_model_dir": "models/ben/det",
    "rec_model_dir": "models/ben/rec",
    "rec_char_dict_path": "models/ben/ben_dict.txt",
    "use_angle_cls": True,
}
ocr_engine, use_paddle = None, False
try:
    ocr_engine = PaddleOCR(**_paddle_kwargs)
    use_paddle = True
    print("✅ PaddleOCR initialized.")
except Exception as init_error:
    # Any construction failure is treated as "PaddleOCR not available".
    use_paddle = False
    print(f"⚠️ PaddleOCR unavailable ({init_error}), using Tesseract fallback.")

# Convert PDF to images (one PIL image per page, rendered at 300 DPI).
pages = convert_from_path(PDF_PATH, dpi=300)
results = []


def _tesseract_text(image_path):
    """Return Tesseract's OCR text (lang="ben") for the image at *image_path*.

    The image is opened in a ``with`` block so its file handle is closed as
    soon as OCR finishes rather than being left open for every page.
    """
    with Image.open(image_path) as image:
        return pytesseract.image_to_string(image, lang="ben")


for i, page in enumerate(pages):
    # Save the rendered page so preprocess_image can read it from disk.
    img_path = os.path.join(TEMP_IMG_DIR, f"page_{i}.png")
    page.save(img_path, "PNG")

    processed_img = preprocess_image(img_path)
    processed_img_path = os.path.join(TEMP_IMG_DIR, f"processed_{i}.png")
    cv2.imwrite(processed_img_path, processed_img)

    # OCR: PaddleOCR when it initialised, otherwise (or on failure) Tesseract.
    text = ""
    if use_paddle:
        try:
            ocr_result = ocr_engine.ocr(processed_img_path, cls=True)
            # ocr_result[0] holds the detections for this image; each entry's
            # [1][0] is joined as text (presumably a (text, confidence) pair —
            # confirm against the installed PaddleOCR version).
            if ocr_result and ocr_result[0]:
                text = " ".join(line[1][0] for line in ocr_result[0])
        except Exception as e:
            print(f"⚠️ PaddleOCR failed on page {i+1} ({e}), switching to Tesseract.")
            text = _tesseract_text(processed_img_path)
    else:
        text = _tesseract_text(processed_img_path)

    # Normalise once so raw_text and the untranslated fallback always agree.
    text = text.strip()

    # Translate to English with OpenAI. Pages with no recognised text are not
    # sent: there is nothing to translate, and a reply to an empty prompt
    # would be stored as if it were page content.
    translated_text = text
    if text:
        try:
            response = openai.ChatCompletion.create(
                model="gpt-4",
                messages=[
                    {"role": "system", "content": "Translate Bengali text into English."},
                    {"role": "user", "content": text}
                ]
            )
            translated_text = response["choices"][0]["message"]["content"].strip()
        except Exception as e:
            # Best effort: keep the OCR text when the API call fails.
            print(f"⚠️ Translation failed ({e}), keeping raw text.")

    results.append({
        "page": i + 1,
        "raw_text": text,
        "english_text": translated_text
    })

# Serialise the collected page records as indented JSON, keeping non-ASCII
# characters unescaped, and write them to OUTPUT_JSON as UTF-8.
serialized_results = json.dumps(results, ensure_ascii=False, indent=2)
with open(OUTPUT_JSON, mode="w", encoding="utf-8") as out_file:
    out_file.write(serialized_results)

print(f"✅ OCR + Translation complete. Output saved to {OUTPUT_JSON}")