from fastapi import FastAPI, File, UploadFile
from fastapi.responses import JSONResponse
from pdf2image import convert_from_bytes
import pytesseract
import cv2
import numpy as np
import re
import os
from tempfile import TemporaryDirectory

# ASGI application object; the /extract route further down is registered on it.
app = FastAPI()

# Bengali option text for each ownership choice -> internal ownership key.
# English glosses, in order: "own land", "leased land", "jointly owned land".
# NOTE(review): no active code visible in this file reads this mapping; only
# the commented-out drafts at the bottom reference it.
ownership_labels = {
    "নিজস্ব জমি": "own",
    "লিজ নেওয়া জমি": "lease",
    "যৌথ মালিকানাধীন জমি": "joint"
}

# Ownership key -> Bengali heading of the form section that applies to it
# (roughly "section applicable to the farmer's own / leased / jointly owned
# land"). Passed to find_section() to locate the part of the page to crop.
section_headers = {
    "own": "কৃষকের নিজস্ব জমির জন্য প্রযোজ্য অংশ",
    "lease": "কৃষকের লিজ নেওয়া জমির জন্য প্রযোজ্য অংশ",
    "joint": "যৌথ মালিকানাধীন জমির জন্য প্রযোজ্য অংশ"
}

# Ownership key -> Bengali field labels that precede a name on the form
# ("farmer's name"; "landowner's name" and "cultivating farmer's name";
# "family member's name"). Each label is also used as a key in the
# "names" part of the /extract response.
name_labels = {
    "own": ["কৃষকের নাম"],
    "lease": ["জমির মালিকের নাম", "চাষাবাদকারী কৃষকের নাম"],
    "joint": ["পরিবারের সদস্যের নাম"]
}

# Ownership key -> Bengali signature-field labels (same parties as
# name_labels, with "signature" instead of "name"). Each label is used both
# as a key in the "signatures" part of the response and as the file-name
# prefix of the cropped signature images.
signature_labels = {
    "own": ["কৃষকের স্বাক্ষর"],
    "lease": ["জমির মালিকের স্বাক্ষর", "চাষাবাদকারী কৃষকের স্বাক্ষর"],
    "joint": ["পরিবারের সদস্যের স্বাক্ষর"]
}

def detect_ownership_from_text(text):
    """Classify the land-ownership type from OCR'd page text.

    Only the first line containing the ownership-type label
    ("জমি মালিকানার ধরণ") is inspected. After stripping colons, underscores
    and the substring "চর", the line is searched for the lease, joint and
    own keywords, in that order of precedence.

    Returns:
        "lease", "joint" or "own", or None when the label line is missing
        or contains none of the keywords.
    """
    for candidate in text.splitlines():
        if "জমি মালিকানার ধরণ" in candidate:
            break
    else:
        # No line carries the ownership label.
        return None

    cleaned = candidate.replace(":", "")
    cleaned = cleaned.replace("_", " ")
    cleaned = cleaned.replace("চর", "")
    cleaned = cleaned.strip()

    # "own" is only reported when neither of the other two keywords is
    # present, so lease and joint are checked first.
    if "লিজ" in cleaned:
        return "lease"
    if "যৌথ" in cleaned:
        return "joint"
    if "নিজ" in cleaned:
        return "own"
    return None

def find_section(image, header_text):
    """Crop the part of ``image`` that starts at a section header.

    The page is OCR'd in Bengali and searched for ``header_text``. Tesseract
    reports one entry per *word*, so the words are first regrouped into text
    lines; otherwise a multi-word header could never match.

    Args:
        image: Page image as a numpy array (indexed ``[row, column]``).
        header_text: Header phrase to look for.

    Returns:
        The crop running from 50 px above/left of the word where the header
        starts to 600 px below/right of it (clipped to the image), or None
        when ``header_text`` is empty or not found.
    """
    if not header_text:
        return None

    data = pytesseract.image_to_data(image, lang='ben', output_type=pytesseract.Output.DICT)

    # Rebuild text lines: map each (page, block, paragraph, line) to the
    # (index, word) pairs it contains, in OCR order.
    line_words = {}
    for i, raw_word in enumerate(data['text']):
        word = raw_word.strip()
        if not word:
            continue
        line_key = (
            data['page_num'][i],
            data['block_num'][i],
            data['par_num'][i],
            data['line_num'][i],
        )
        line_words.setdefault(line_key, []).append((i, word))

    for words in line_words.values():
        line_text = " ".join(word for _, word in words)
        offset = line_text.find(header_text)
        if offset == -1:
            continue

        # Find the word that contains the first character of the header so
        # the crop is anchored on the header itself, not the line start.
        anchor = words[0][0]
        cursor = 0
        for i, word in words:
            cursor += len(word)
            if offset < cursor:
                anchor = i
                break
            cursor += 1  # account for the joining space

        x, y = data['left'][anchor], data['top'][anchor]
        # Clamp at 0: a negative slice start would wrap around to the other
        # side of the array instead of clipping at the edge.
        return image[max(y - 50, 0):y + 600, max(x - 50, 0):x + 600]
    return None

def extract_name(segment_image, label_text):
    """OCR ``segment_image`` and return the token that follows ``label_text``.

    Args:
        segment_image: Image region to OCR (Bengali language model).
        label_text: Field label that precedes the name on the form.

    Returns:
        The first whitespace-delimited token after the label (an optional
        ":" or "-" and any whitespace are skipped), or the string
        "Not found" when the label does not occur in the OCR text.
    """
    text = pytesseract.image_to_string(segment_image, lang='ben')
    # Escape the label so it is always matched literally, even if a label
    # ever contains regex metacharacters such as "(", "." or "?".
    pattern = fr"{re.escape(label_text)}[:\-]?\s*(\S+)"
    match = re.search(pattern, text)
    return match.group(1) if match else "Not found"

def extract_signature(segment_image, label, output_dir):
    """Save signature-shaped regions of ``segment_image`` as JPEG files.

    The image is binarised (inverse threshold at 150) and every external
    contour whose bounding box is wider than 100 px and shorter than 100 px
    is treated as a signature candidate and written to ``output_dir`` as
    ``{label}_signature_{n}.jpg``.

    Args:
        segment_image: BGR image region to search.
        label: Prefix for the output file names.
        output_dir: Directory the crops are written to.

    Returns:
        Paths of the files that were actually written, in contour order.
    """
    gray = cv2.cvtColor(segment_image, cv2.COLOR_BGR2GRAY)
    _, thresh = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY_INV)
    contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    count = 0
    paths = []
    for cnt in contours:
        x, y, w, h = cv2.boundingRect(cnt)
        if not (w > 100 and h < 100):
            continue

        # Skip indices already used by an earlier call with the same label
        # and directory (e.g. a previous page), so files are never silently
        # overwritten and returned paths stay unique.
        path = os.path.join(output_dir, f"{label}_signature_{count}.jpg")
        while os.path.exists(path):
            count += 1
            path = os.path.join(output_dir, f"{label}_signature_{count}.jpg")

        # imwrite reports failure through its return value rather than an
        # exception; only report paths that really exist on disk.
        if cv2.imwrite(path, segment_image[y:y+h, x:x+w]):
            paths.append(path)
        count += 1
    return paths

@app.post("/extract")
async def extract_fields(file: UploadFile = File(...)):
    """Extract ownership types, names and signature crops from an uploaded PDF.

    Every page is rendered at 300 dpi and OCR'd. For each page on which an
    ownership type is detected, the matching form section is located (the
    whole page is used as a fallback) and the names and signatures defined
    for that ownership type are extracted.

    Returns:
        A dict with
        - "ownership_types": distinct ownership keys in order of appearance,
        - "names": field label -> extracted name (or "Not found"),
        - "signatures": field label -> list of saved image paths,
        or a 400 JSON error response when the upload is empty or cannot be
        read as a PDF.
    """
    # Imported here because only this handler needs pdf2image's error types.
    from pdf2image.exceptions import PDFPageCountError, PDFSyntaxError

    pdf_bytes = await file.read()
    if not pdf_bytes:
        return JSONResponse(content={"error": "Uploaded file is empty"}, status_code=400)

    # NOTE(review): the signature paths returned below point into tmpdir,
    # which is deleted as soon as this block exits, so clients receive paths
    # to files that no longer exist. Left as-is to keep the response shape;
    # confirm what callers expect.
    with TemporaryDirectory() as tmpdir:
        try:
            images = convert_from_bytes(pdf_bytes, dpi=300)
        except (PDFPageCountError, PDFSyntaxError):
            return JSONResponse(
                content={"error": "Uploaded file is not a readable PDF"},
                status_code=400,
            )

        result = {
            "ownership_types": [],
            "names": {},
            "signatures": {}
        }

        for img in images:
            img_cv = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
            text = pytesseract.image_to_string(img_cv, lang='ben+eng')
            ownership = detect_ownership_from_text(text)

            # Pages without a recognisable ownership type contribute nothing.
            if ownership is None:
                continue

            if ownership not in result["ownership_types"]:
                result["ownership_types"].append(ownership)

            section = find_section(img_cv, section_headers.get(ownership, ""))
            target = section if section is not None else img_cv

            for label in name_labels.get(ownership, []):
                # Keep the first real match, but let a later page replace a
                # "Not found" placeholder left by an earlier one.
                if result["names"].get(label, "Not found") == "Not found":
                    result["names"][label] = extract_name(target, label)

            for label in signature_labels.get(ownership, []):
                sig_paths = extract_signature(target, label, tmpdir)
                result["signatures"].setdefault(label, []).extend(sig_paths)

        return result









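# ---------------------------------------------------------------------------
# Commented-out draft (not executed). Same helpers as the active code above,
# but the endpoint stores a single "ownership_type" and stops after the
# first page.
# ---------------------------------------------------------------------------
# from fastapi import FastAPI, File, UploadFile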
# from fastapi.responses import JSONResponse
# from pdf2image import convert_from_bytes
# import pytesseract
# import cv2
# import numpy as np
# import re
# import os
# from tempfile import TemporaryDirectory

# app = FastAPI()

# ownership_labels = {
#     "নিজস্ব জমি": "own",
#     "লিজ নেওয়া জমি": "lease",
#     "যৌথ মালিকানাধীন জমি": "joint"
# }

# section_headers = {
#     "own": "কৃষকের নিজস্ব জমির জন্য প্রযোজ্য অংশ",
#     "lease": "কৃষকের লিজ নেওয়া জমির জন্য প্রযোজ্য অংশ",
#     "joint": "যৌথ মালিকানাধীন জমির জন্য প্রযোজ্য অংশ"
# }

# name_labels = {
#     "own": ["কৃষকের নাম"],
#     "lease": ["জমির মালিকের নাম", "চাষাবাদকারী কৃষকের নাম"],
#     "joint": ["পরিবারের সদস্যের নাম"]
# }

# signature_labels = {
#     "own": ["কৃষকের স্বাক্ষর"],
#     "lease": ["জমির মালিকের স্বাক্ষর", "চাষাবাদকারী কৃষকের স্বাক্ষর"],
#     "joint": ["পরিবারের সদস্যের স্বাক্ষর"]
# }

# def detect_ownership_from_text(text):
#     line = next((l for l in text.splitlines() if "জমি মালিকানার ধরণ" in l), "")
#     if not line:
#         return None

#     line = line.replace(":", "").replace("_", " ").replace("চর", "").strip()

#     if "নিজ" in line and "লিজ" not in line and "যৌথ" not in line:
#         return "own"
#     elif "লিজ" in line:
#         return "lease"
#     elif "যৌথ" in line:
#         return "joint"
#     return None

# def find_section(image, header_text):
#     data = pytesseract.image_to_data(image, lang='ben', output_type=pytesseract.Output.DICT)
#     for i, word in enumerate(data['text']):
#         if header_text in word:
#             x, y, w, h = data['left'][i], data['top'][i], data['width'][i], data['height'][i]
#             return image[y-50:y+600, x-50:x+600]
#     return None

# def extract_name(segment_image, label_text):
#     text = pytesseract.image_to_string(segment_image, lang='ben')
#     match = re.search(fr"{label_text}[:\-]?\s*(\S+)", text)
#     return match.group(1) if match else "Not found"

# def extract_signature(segment_image, label, output_dir):
#     gray = cv2.cvtColor(segment_image, cv2.COLOR_BGR2GRAY)
#     _, thresh = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY_INV)
#     contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

#     count = 0
#     paths = []
#     for cnt in contours:
#         x, y, w, h = cv2.boundingRect(cnt)
#         if w > 100 and h < 100:
#             sig = segment_image[y:y+h, x:x+w]
#             path = os.path.join(output_dir, f"{label}_signature_{count}.jpg")
#             cv2.imwrite(path, sig)
#             paths.append(path)
#             count += 1
#     return paths

# @app.post("/extract")
# async def extract_fields(file: UploadFile = File(...)):
#     with TemporaryDirectory() as tmpdir:
#         pdf_bytes = await file.read()
#         images = convert_from_bytes(pdf_bytes, dpi=300)

#         result = {"ownership_type": None, "names": {}, "signatures": {}}

#         for img in images:
#             img_cv = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
#             text = pytesseract.image_to_string(img_cv, lang='ben+eng')
#             ownership = detect_ownership_from_text(text)
#             result["ownership_type"] = ownership

#             section = find_section(img_cv, section_headers.get(ownership, "")) if ownership else None
#             target = section if section is not None else img_cv

#             for label in name_labels.get(ownership, []):
#                 name = extract_name(target, label)
#                 result["names"][label] = name

#             for label in signature_labels.get(ownership, []):
#                 sig_paths = extract_signature(target, label, tmpdir)
#                 result["signatures"][label] = sig_paths

#             break  # process only first matching page

#         return result 






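# ---------------------------------------------------------------------------
# Commented-out draft (not executed). Finds the ownership-type label via OCR,
# crops around it, then OCRs the text to the right of small (10-50 px)
# contours -- presumably checkboxes -- to decide the ownership type. Returns
# a 400 error when no ownership type or section is found.
# ---------------------------------------------------------------------------
# def find_ownership_segment(image):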
#     data = pytesseract.image_to_data(image, lang='ben', output_type=pytesseract.Output.DICT)
#     for i, word in enumerate(data['text']):
#         if "জমি মালিকানার ধরণ:" in word:
#             x, y, w, h = data['left'][i], data['top'][i], data['width'][i], data['height'][i]
#             return image[y-50:y+300, x-50:x+600]
#     return None

# def detect_ownership_from_segment(segment):
#     gray = cv2.cvtColor(segment, cv2.COLOR_BGR2GRAY)
#     _, thresh = cv2.threshold(gray, 180, 255, cv2.THRESH_BINARY_INV)
#     contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

#     for cnt in contours:
#         x, y, w, h = cv2.boundingRect(cnt)
#         if 10 < w < 50 and 10 < h < 50:
#             roi = segment[y:y+h, x+50:x+300]
#             text = pytesseract.image_to_string(roi, lang='ben')
#             for label in ownership_labels:
#                 if label in text:
#                     return ownership_labels[label]
#     return None


# @app.post("/extract")
# async def extract_fields(file: UploadFile = File(...)):
#     with TemporaryDirectory() as tmpdir:
#         pdf_bytes = await file.read()
#         images = convert_from_bytes(pdf_bytes, dpi=300)

#         result = {"ownership_type": None, "names": {}, "signatures": {}}
#         found = False

#         for img in images:
#             img_cv = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
#             segment = find_ownership_segment(img_cv)
#             if segment is not None:
#                 ownership = detect_ownership_from_segment(segment)
#                 if ownership:
#                     section = find_section(img_cv, section_headers[ownership])
#                     if section is not None:
#                         result["ownership_type"] = ownership
#                         for label in name_labels[ownership]:
#                             name = extract_name(section, label)
#                             result["names"][label] = name
#                         for label in signature_labels[ownership]:
#                             sig_paths = extract_signature(section, label, tmpdir)
#                             result["signatures"][label] = sig_paths
#                         found = True
#                         break

#         if not found:
#             return JSONResponse(content={"error": "Ownership type or section not found"}, status_code=400)

#         return result













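#################################################################
# Commented-out draft (not executed). Variant whose detect_ownership() scans
# the whole page for small (10-50 px) contours and OCRs the text beside them,
# instead of parsing the OCR text of the ownership-type line.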
# from fastapi import FastAPI, File, UploadFile
# from fastapi.responses import JSONResponse
# from pdf2image import convert_from_bytes
# import pytesseract
# import cv2
# import numpy as np
# import re
# import os
# from tempfile import TemporaryDirectory

# app = FastAPI()

# ownership_labels = {
#     "নিজস্ব জমি": "own",
#     "লিজ নেওয়া জমি": "lease",
#     "যৌথ মালিকানাধীন জমি": "joint"
# }

# section_headers = {
#     "own": "কৃষকের নিজস্ব জমির জন্য প্রযোজ্য অংশ",
#     "lease": "কৃষকের লিজ নেওয়া জমির জন্য প্রযোজ্য অংশ",
#     "joint": "যৌথ মালিকানাধীন জমির জন্য প্রযোজ্য অংশ"
# }

# name_labels = {
#     "own": ["কৃষকের নাম"],
#     "lease": ["জমির মালিকের নাম", "চাষাবাদকারী কৃষকের নাম"],
#     "joint": ["পরিবারের সদস্যের নাম"]
# }

# signature_labels = {
#     "own": ["কৃষকের স্বাক্ষর"],
#     "lease": ["জমির মালিকের স্বাক্ষর", "চাষাবাদকারী কৃষকের স্বাক্ষর"],
#     "joint": ["পরিবারের সদস্যের স্বাক্ষর"]
# }

# def detect_ownership(image):
#     gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
#     _, thresh = cv2.threshold(gray, 180, 255, cv2.THRESH_BINARY_INV)
#     contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

#     for cnt in contours:
#         x, y, w, h = cv2.boundingRect(cnt)
#         if 10 < w < 50 and 10 < h < 50:
#             roi = image[y:y+h, x+50:x+300]
#             text = pytesseract.image_to_string(roi, lang='ben')
#             for label in ownership_labels:
#                 if label in text:
#                     return ownership_labels[label]
#     return None

# def find_section(image, header_text):
#     data = pytesseract.image_to_data(image, lang='ben', output_type=pytesseract.Output.DICT)
#     for i, word in enumerate(data['text']):
#         if header_text in word:
#             x, y, w, h = data['left'][i], data['top'][i], data['width'][i], data['height'][i]
#             return image[y-50:y+600, x-50:x+600]
#     return None

# def extract_name(segment_image, label_text):
#     text = pytesseract.image_to_string(segment_image, lang='ben')
#     match = re.search(fr"{label_text}[:\-]?\s*(\S+)", text)
#     return match.group(1) if match else "Not found"

# def extract_signature(segment_image, label, output_dir):
#     gray = cv2.cvtColor(segment_image, cv2.COLOR_BGR2GRAY)
#     _, thresh = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY_INV)
#     contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

#     count = 0
#     paths = []
#     for cnt in contours:
#         x, y, w, h = cv2.boundingRect(cnt)
#         if w > 100 and h < 100:
#             sig = segment_image[y:y+h, x:x+w]
#             path = os.path.join(output_dir, f"{label}_signature_{count}.jpg")
#             cv2.imwrite(path, sig)
#             paths.append(path)
#             count += 1
#     return paths

# @app.post("/extract")
# async def extract_fields(file: UploadFile = File(...)):
#     with TemporaryDirectory() as tmpdir:
#         pdf_bytes = await file.read()
#         images = convert_from_bytes(pdf_bytes, dpi=300)

#         result = {"ownership_type": None, "names": {}, "signatures": {}}
#         found = False

#         for img in images:
#             img_cv = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
#             ownership = detect_ownership(img_cv)

#             if ownership:
#                 section = find_section(img_cv, section_headers[ownership])
#                 if section is not None:
#                     result["ownership_type"] = ownership
#                     for label in name_labels[ownership]:
#                         name = extract_name(section, label)
#                         result["names"][label] = name
#                     for label in signature_labels[ownership]:
#                         sig_paths = extract_signature(section, label, tmpdir)
#                         result["signatures"][label] = sig_paths
#                     found = True
#                     break

#         if not found:
#             return JSONResponse(content={"error": "Ownership type or section not found"}, status_code=400)

#         return result