import json
import re
from google.cloud import translate_v2 as translate

# ---------- CONFIG ----------
# Path of the JSON file that translate_json() reads (OCR-extracted tables).
INPUT_JSON = "/var/www/html/land-ocr/output/banglarbhumi_final.json"
# Path of the JSON file that translate_json() writes the translated tables to.
OUTPUT_JSON = "/var/www/html/land-ocr/output/tables_en.json"
# Language codes passed to the translation client as source/target language.
SOURCE_LANG = "bn"
TARGET_LANG = "en"

# ---------- INIT ----------
# Shared client used by translate_text(). It is constructed at import time, so
# importing this module already instantiates the client.
# NOTE(review): presumably credentials come from the environment (Application
# Default Credentials) since none are passed here — confirm for deployment.
translator = translate.Client()

# ---------- CLEAN OCR TEXT ----------
def clean_text(text):
    """Normalise whitespace in an OCR string.

    NUL characters are turned into spaces, every run of whitespace is
    collapsed to a single space, and leading/trailing whitespace is removed.

    Args:
        text: The value to clean.

    Returns:
        The cleaned string. Falsy values and non-string values are returned
        unchanged.
    """
    # Anything that is not a non-empty string passes straight through.
    if not (text and isinstance(text, str)):
        return text

    # NULs become spaces first so they merge into the surrounding whitespace.
    return re.sub(r"\s+", " ", text.replace("\u0000", " ")).strip()

# ---------- TRANSLATE ----------
def translate_text(text):
    """Translate ``text`` from SOURCE_LANG to TARGET_LANG.

    Uses the module-level ``translator`` client.

    Args:
        text: The string to translate.

    Returns:
        The translated string. Empty values and non-string values (None,
        numbers, nested lists, ...) are returned unchanged without calling
        the translation API.
    """
    # Only non-empty strings are sent to the API: other values would break the
    # single-result lookup below (the client returns one dict only for a
    # single string input).
    if not text or not isinstance(text, str):
        return text

    result = translator.translate(
        text,
        source_language=SOURCE_LANG,
        target_language=TARGET_LANG,
        # The input is plain OCR text. Without an explicit format the v2 API
        # treats it as HTML and returns entity-escaped output (e.g. "&#39;").
        format_="text",
    )
    return result["translatedText"]

# ---------- MAIN ----------
def translate_json():
    """Translate every table in INPUT_JSON and write the result to OUTPUT_JSON.

    The input is expected to be a JSON array of table objects. For each table
    the ``page`` and ``table_index`` values are copied through, and every
    entry of ``headers`` and every cell of ``rows`` is cleaned with
    ``clean_text`` and translated with ``translate_text``.

    Empty values and non-string values are kept as they are (after cleaning)
    rather than being sent to the translator. A missing or null ``headers`` /
    ``rows`` entry, or a null row, is treated as empty.

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
        json.JSONDecodeError: If the input file is not valid JSON.
    """
    with open(INPUT_JSON, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Table cells repeat a lot (labels, units, names), and every translation
    # is a network call, so each distinct cleaned string is translated once.
    translation_cache = {}

    def translate_cell(value):
        """Clean one header/cell value and translate it if it is real text."""
        cleaned = clean_text(value)
        # Empty or non-string values (None, numbers, ...) are passed through.
        if not cleaned or not isinstance(cleaned, str):
            return cleaned
        if cleaned not in translation_cache:
            translation_cache[cleaned] = translate_text(cleaned)
        return translation_cache[cleaned]

    translated_data = []

    for table in data:
        # `or []` also covers keys that are present but null, which
        # `.get(key, [])` alone would return as None.
        headers = table.get("headers") or []
        rows = table.get("rows") or []

        translated_data.append({
            "page": table.get("page"),
            "table_index": table.get("table_index"),
            "headers": [translate_cell(header) for header in headers],
            "rows": [
                [translate_cell(cell) for cell in (row or [])]
                for row in rows
            ],
        })

    with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps any non-ASCII text readable in the output.
        json.dump(translated_data, f, ensure_ascii=False, indent=2)

    print("✅ Translation complete")
    print(f"📄 Output file: {OUTPUT_JSON}")

# ---------- RUN ----------
# Run the translation only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    translate_json()
