o
    &-i                     @   s
  d dl Z d dlZd dlZd dlZd dlZd dlmZmZmZm	Z	 d dl
mZ d dlmZmZmZ d dlmZ d dlZd dlZd dlmZ d dlmZ d dlZd dlmZ d d	lmZ zd dlZd
ZW n eym   dZY nw zd dl m!Z! d dl"m#Z$ d
Z%W n ey   dZ%Y nw dZ&e j'e&d
d eddZ(e)dZ*ej)dej+dZ,ej)dej+dZ-ej)dej+dZ.dZ/dddZ0ddee1 de1de	ee1e1f  dee1 fddZ2d e1de1fd!d"Z3d e1de1fd#d$Z4d%d&d'd(Z5d)e1de1fd*d+Z6d,e1de1fd-d.Z7d)e1de1fd/d0Z8i d1d2d3d4d5d6d7d8d9d:d;d<d=d>d?d@dAdBdCdDdEdFdGdHdIdJdKdLdMdNdOdJZ9i dPdQdRdSdTdUdVdWdXdYdZd>d[d\d]d^d_d`dadbdcdUdddedfdgdhdidjdkdldmdndUdodFdpdqd@dkdrdsdbdt	Z:edudvddxe1dye1de1fdzd{Z;ddxe1dye1de1fd|d}Z<ddxe1de=dee1 fddZ>edudvdde1de=dee1 fddZ?dde1de=dee1 fddZ@dde1de=dee1 fddZAde1de1fddZBdPe1dRe1deCfddZDd&d&d&d'd&dZEde1de1fddZFdPe1dRe1deCfddZGde1de1deCfddZHdee1ef deee1ef  fddZIde1de1deCfddZJde1de1deCfddZKde1de1deCfddZLde1de1deCfddZM		dde1de1deCdeCdee1ef f
ddZNde1fddZOG dd deZPe(QdedfdePfddZRdS )    N)ListDictAnyOptional)	lru_cache)FastAPIHTTPExceptionBody)	BaseModel)extract_text)convert_from_path)fuzz)LevenshteinTF)	sanscript)transliterate	downloadsexist_okz1LandDoc OCR - IndicTrans Hybrid Matcher (patched))titlez[\u0980-\u09FF]ua   (?:দা\s*গ|দাগ|দাগ নং|দাগ:|দাগঃ)\s*[:\-]?\s*([0-9A-Za-z\/\-]+))flagsu   (?:খত(?:ি|িয়া|িয়|িয়|ান)?\s*(?:নং|ন|No\.?|No|number)?|khatian\s*(?:no|no\.|number))\s*[:\-]?\s*([0-9A-Za-z\/\-]+)u   (?:রায়ত(?:ের)?\s*নাম|রায়তের নাম|রায়ত নাম|মালিক(?:ের)?\s*নাম)   c              	      s   | j ||dd4 I d H 6}|  t|dI d H }|jd2 z3 d H W }||I d H  q!6 | I d H  W d   I d H  d S 1 I d H sIw   Y  d S )N<   )headerstimeoutwbi   )getraise_for_statusaiofilesopencontentiter_chunkedwriteclose)sessionurl	dest_pathr   respfchunk r)   */var/www/html/land-doc-ocr/chatgpt_main.pydownload_file3   s   .r+   urlsdest_dirr   returnc              	      s   t j|dd ttt 4 I d H Pg }t| D ]5\}}t j	|
dd d p.d}d| | }t j||}||f fdd		}	|t|	  qtj| I d H }
W d   I d H  |
S 1 I d H skw   Y  |
S )
NTr   ?r      z.pdfdoc_c              	      sV   4 I d H  t | | dI d H  W d   I d H  |S 1 I d H s$w   Y  |S )N)r   )r+   )upr   semsessr)   r*   _dlD   s   zdownload_all.<locals>._dl)osmakedirsasyncio	SemaphoreMAX_CONCURRENCYaiohttpClientSession	enumeratepathsplitextsplitjoinappendcreate_taskgather)r,   r-   r   tasksir$   extfnr@   r7   resr)   r4   r*   download_all;   s    
rL   r@   c                 C   sD   zt | }|rt| dkr|W S W dS W dS  ty!   Y dS w )NP    )r   lenstrip	Exception)r@   tr)   r)   r*   extract_text_from_pdfM   s   rS   c                 C   s8   g }t | dd}|D ]}|tj|dd q
d|S )N,  )dpizben+eng)lang
)r   rD   pytesseractimage_to_stringrC   )r@   partsimagesimgr)   r)   r*   ocr_pdfV   s
   
r]   u   মণ্ডল rN   )u   মন্ডল    u   ‌sc                 C   s"   t  D ]
\}}| ||} q| S N)COMMON_REPLACEMENTSitemsreplace)r`   kvr)   r)   r*   apply_common_replacements`   s   rg   namec                 C   s4   | sdS t dd| }t|}t dd| }|S )NrN   z[\x00-\x1F]+r^   \s+)resubrg   rP   )rh   r`   r)   r)   r*   normalize_bengali_namee   s   rl   c                 C   sP   | s| S d}| }t d}||kr|}|d|}||kst dd| }|S )z
    Remove erroneous spaces between Bengali characters introduced by OCR:
    join sequences where a Bengali codepoint is separated by spaces from another Bengali codepoint.
    Nz%([\u0980-\u09FF])\s+([\u0980-\u09FF])z\1\2ri   r^   )rj   compilerk   rP   )r`   prevcurpatternr)   r)   r*    collapse_bengali_internal_spacesm   s   
rq   khu   খghu   ঘchu   চjhu   ঝthu   থdhu   ধphu   ফbhu   ভshu   শshhu   ষssu   সngu   ঙnyu   ঞrru   ঢ়tru	   ত্রgnau   অbu   বcu   কdu   দeu   এr'   gu   গhu   হrH   u   ইju   জre   lu   লmu   মnu   নou   ওr3   u   পqu   রu   তu   উu	   ক্সu   য়)	rr`   rR   r2   rf   wxyzi   )maxsizeKOLKATAbn_name
scheme_outc                 C      t | |dS )Nr   )transliterate_forward_indic)r   r   r)   r)   r*   "transliterate_forward_indic_cached      r   c                 C   s   | sdS t | }tstdd|S tjtjtjtjd}d}zt	|tj
||tj}W n tyK   z
t	|tj
tj}W n tyH   d}Y nw Y nw tdd| }|S )NrN   ri   r^   )r   ITRANSIASTWX)rl   _HAS_INDIC_TRANSrj   rk   r   r   r   r   r   it_transliterateBENGALIr   rQ   rP   )r   r   bn
scheme_mapoutr)   r)   r*   r      s.   r      max_variantsc                    s  g t    fdd}t| dd}|| ||  ||dd ||dd  trXztt| tjtj	}|| ||  ||dd W n	 t
yW   Y nw td d  D ] }||dd t|d	kr||d d
  ||dd   q`d | S )Nc                    s>   | sd S t dd|  }| vr | | d S d S )Nri   r^   )rj   rk   rP   addrD   )r   x2seenvariantsr)   r*   r      s   
z&generate_forward_variants.<locals>.addr   r   r^   rN   rz   r`      r0   )setr   lowerrd   r   r   rl   r   r   r   rQ   listrO   )r   r   r   re   itrrf   r)   r   r*   generate_forward_variants   s$   8( r      en_namec                 C   r   )Nr   )reverse_transliterate_indic)r   r   r)   r)   r*   "reverse_transliterate_indic_cached   r   r   c           
   	   C   s@  | sg S t dd|  }t dd| }g }t }ts"t| |S tjtj	tj
tjg}|D ]0}t||kr8 n'zt||tj}t|}|rT||vrT|| || W q. ty^   Y q.w | }t||k rt|dkrz!td|tjtj}	t|	}	|	r|	|vr||	 ||	 W n	 ty   Y nw |d | S )Nz[^a-zA-Z0-9\s]r^   ri   r0   rN   )rj   rk   rP   r   r   r   _fallback_reverse_variantsr   r   r   r   r   rO   r   r   rl   r   rD   rQ   rB   rC   )
r   r   r`   r   r   stylessrcr   toksjoinedr)   r)   r*   r      sB   
r   c                    s   t dd|   }| }|sg S dtdtfdd  fdd|D }dd	d |D }d
dd |D }g }|rB|| |rM||krM|| |d | S )Nz[^a-z0-9\s]r^   tokr.   c                 S   s   d}g }|t | k ri|d t | kr.| ||d  tv r.|t| ||d    |d7 }q|d t | krR| ||d  tv rR|t| ||d    |d7 }q| | }|t|d |d7 }|t | k s
d|S )Nr   r      rN   r0   )rO   _DOUBLE_MAPrD   _SINGLE_MAPr   rC   )r   rH   r   rt   r)   r)   r*   roman_to_bn   s   $$$$
z/_fallback_reverse_variants.<locals>.roman_to_bnc                    s   g | ]} |qS r)   r)   .0rR   r   r)   r*   
<listcomp>       z._fallback_reverse_variants.<locals>.<listcomp>c                 S      g | ]}|r|qS r)   r)   r   r)   r)   r*   r      r   rN   c                 S   r   r)   r)   r   r)   r)   r*   r      r   )rj   rk   r   rP   rB   strrC   rD   )r   r   r`   r   	bn_tokensspacedr   r   r)   r   r*   r      s   r   r   c                 C   s   t | pdS )NrN   )rl   )r   r)   r)   r*   normalize_bn_for_compare   s   r   c                 C   sp   t | } t |}| r|sdS z
t| |d }W n ty'   t| |}Y nw t| |}d| d|  }|S )N              Y@ffffff?g333333?)r   r   normalized_similarityrQ   r   ratiopartial_ratio)r   r   levpartscorer)   r)   r*   bn_string_similarity   s   r   ).,-'z  api_namec                 C   sD   | pd   }t D ]
\}}|||}qtdd|  }|S )NrN   ri   r^   )rP   r   EN_REPLACE_MAPrc   rd   rj   rk   )r   r`   re   rf   r)   r)   r*   normalize_api_name  s
   r   c                 C   s   | pd   } |pd   }| r|sdS ts | |krdS dS zt| p'd}t|p.d}|r3|s6W dS t||W S  tyF   Y dS w )NrN   r   r   )rP   r   _HAS_JELLYFISH	jellyfish	metaphoner   r   rQ   )r   r   mambr)   r)   r*   token_phonetic_score  s    r   api_tokcand_tokc                 C   s   | pd  } |p	d  }| r|sdS t| |}t| |}t| |}z
t| |d }W n ty9   d}Y nw t| |}d| d|  d|  d|  d|  }|S )NrN   r   r   皙?g333333?gffffff?)	rP   r   r   r   token_sort_ratior   r   rQ   r   )r   r   t1t2t3r   phonr   r)   r)   r*   token_score_en$  s   
(r   file_mapc              	   C   s   g }|   D ]H\}}|di   D ];\}}||||d r$|d d nd|d |di |}|rM||krM||||d rG|d d nd|d qq|S )z
    Returns list of candidate dicts:
      { "bn": <bengali_string_variant>, "file": <file_url>, "daag_no": <>, "khatian_no": <> }
    includes both normalized and joined (no-space) variants for robustness.
    khatian_mapdaagsr   N)r   filedaag_no
khatian_nokhatian_map_joined)rc   r   rD   )r   r   file_urlfmaprr   r   r   r)   r)   r*   #generate_bn_candidates_from_filemap4  s   ((r   c              	   C   s   t | }|sdS t|dd}d}|D ]A}tdd|  }t||}t||}z
t	
||d }	W n ty@   d}	Y nw d| d|  d	|	  }
|
|krS|
}q|d S )
Nr   r   r   [^0-9a-z\s]r^   r   g?g      ?r   )r   r   rj   rk   r   rP   r   token_set_ratior   r   r   rQ   )r   r   api_normr   bestrf   v_normtsprr   combinedr)   r)   r*   match_forwardD  s&   r   c                 C   sH   | sdS t | dd}d}t|}|D ]}t||}||kr|}q|d S )Nr   r   r   r   )r   r   r   )r   r   rvr   bn_normcandscr)   r)   r*   match_reverseW  s   
r  bn1bn2c                 C   sV   | r|sdS t |  }t | }|r|sdS | d }| d }t||d S )Nr   r   r   )r   rP   rB   r   )r  r  r   r   a_lastb_lastr)   r)   r*   surname_similarity_bnc  s   r	  c                 C   s&  t |}| }|r|d nd}t| }| r| d nd}d}z*t| dd}|D ]}	|	 r7|	 d nd}
|
rJ|rJt||
d }||krJ|}q+W n tyW   d}Y nw d}z(t|ddpbd}|rttd	d
|	 
  d nd}|r|rt||d }W n ty   d}Y nw t||S )Nr   rN   r   r   r   r   r   r   r   r^   )r   rB   r   r   r   rQ   r   rj   rk   r   rP   r   max)r   r   r  r   bn_firstr   	api_firstbest_bn_space	rev_candsrcrc_firstr  best_en_spacefwd	fwd_firstr)   r)   r*   first_name_similarityn  s<   &
r  r   333333?surname_boostsurname_thresholdc                 C   s   t | |}t| |}||kr|n|}||krdnd}d}zt| dd}	d}
|	D ]}t||}||
kr4|}
q'|
}W n tyC   d}Y nw |}|rS||krStd|| }t| |}||||||dS )Nforwardreverser   r   r   g      ?)forward_scorereverse_score
best_scorebest_directionsurname_similarityr  )r   r  r   r	  rQ   minr  )r   r   r  r  r'   r   	base_bestbest_dirsurname_simr  
best_rev_sr  ssimboostedfn_simr)   r)   r*   hybrid_match  s8   



r'  textc                 C   s  t dd| }dd | D }dd t| D }g }t| D ]^}| }| ||d  }| D ]K}| }t	|r~dt
|  krJdkr~n q3|rR|d	 nd }	d }
| td	| d
 | d
  }t	|}|ru|d }
||	|
|d q3q!t|D ]M\}}t 	d|}|r|d}tddD ]6}|| t
|k r|||  }t	|rdt
|  krdkrn q|r|d	 nd }	||	||d  nqqt }g }|D ]#}|d d|d d|d }||vr|| || q|S )Nz\rrW   c                 S   s   g | ]
}|  r|  qS r)   )rP   )r   lnr)   r)   r*   r     s    z/extract_daag_khatian_raiter.<locals>.<listcomp>c                 S   s   g | ]	}| d  qS )r0   )grouprP   )r   r   r)   r)   r*   r     s    rT   r      r      r0   )daagkhatianraiterz!\b([0-9]{2,6}(?:\/[0-9]{1,4})?)\b   x   r-  z::r.  r/  )rj   rk   
splitlinesDAAG_REfinditerRAITER_LABEL_REendrP   
BENGALI_REsearchrO   r
  start
KHATIAN_REr*  rD   r?   ranger   r   r   )r(  	text_normlinesr   resultsr   r9  snippetr)  r-  rr   backkmrH   mnumkidr   nxtr   r   r   keyr)   r)   r*   extract_daag_khatian_raiter  sH   &"
	
&
&rF  c                   @   sF   e Zd ZU ee ed< ee ed< ee ed< ee ed< eed< dS )SimpleCheckRequestr-  r.  farmerfiles	thresholdN)__name__
__module____qualname__r   r   __annotations__floatr)   r)   r)   r*   rG    s   
 rG  z/quick_check.reqc           5         sH  zt | j}W n ty   d}Y nw d}d}d}d}d}tjdd}zzt| j|I d H }W n tyM }	 ztj|dd	 t	d
d|	 dd }	~	ww i }
t
 }t
 }t| j|D ]\}}t|}|rnt| dk rzt|}W n ty   d}Y nw t|}g i i i d}|D ]w}|drt|dnd }|drt|dnd }|dpd}t|}t|}tdd|}|r||d vr|d | || |r|p|pd|d |< |r|n|pd|d |< || |d | r||d |d | < q||
|< q\g }g }| jD ]}t||vr%|t|| jd q| jD ]}t||vr>|t|| jd q*t|
}g }g }i }i } i }!|D ]:}"|"d}|"d}#|"d}$|ro|t|g |" |#r}| t|#g |" |$r|!|$g |" qP| jD ]Y}%g }&| jrt
dd | jD }'|'D ]}(|(|v r|&||(  q|&s| jrt
dd | jD })|)D ]}*|*| v r|&| |*  q|&s| jr| jD ]}+|+|!v r|&|!|+  q|&st|}&dd d d d d ddddd 
},|&D ]R}-|-d!}.|.sqt |%|.||d"}/|/d# }0|0|,d# krW|/d$}1|1d u r4t!|%|.}1|,"|0|.|-d|-d|-d|/d% |/d& |/d' |/d(d|1d 
 q|,d# |kr|,d( |kr|,d$ |kr||%|,d) |,d* |,d+ |,d, |,d# |,d# |,d- |,d& |,d' |,d( |,d$ d. qg }2|,d# |k r|2d/ |,d( |k r|2d0 |,d$ |k r|2d1 ||%|,d# |,d# |,d* |,d) |,d+ |,d, |,d- |,d& |,d' |,d( |,d$ |2d2 qt|t| t| t|t|t|d3}3|3d4 d5kr	d6nd7}4||||3|4|d8W tj|dd	 S tj|dd	 w )9Nr   r   r  g?g?quickcheck_)prefixT)ignore_errorsi  zdownload_error: )status_codedetailrM   rN   )r   r   r   bn_to_khatianr-  r.  r/  ri   r   r   r   rV  )r-  files_checked)r.  rW  r   r   r   c                 S      g | ]}t |qS r)   r   r   r   r)   r)   r*   r   5  r   zquick_check.<locals>.<listcomp>c                 S   rX  r)   rY  rZ  r)   r)   r*   r   <  r   )
r  best_bn	best_file	best_daagbest_khatian	directionr  r  r  r  r   )r  r  r  r  r  r  r  r  r\  r[  r]  r^  r_  )rH  matched_filefound_bengalir   r   r   r  r_  r  r  r  r  best_score_lowsurname_lowfirst_name_low)rH  r  r   ra  r   r   r   r_  r  r  r  r  reason)total_mismatchesdaag_mismatcheskhatian_mismatchesfarmer_mismatchesrf  r   OKzMismatches found)missing_daagmissing_khatianmissing_farmersummarystatusmatches)#rO  rJ  rQ   tempfilemkdtemprL   rI  shutilrmtreer   r   ziprS   rO   rP   r]   rF  r   r   rl   rq   rj   rk   rD   r   r-  r.  r   
setdefaultrH  extendr   r'  r  update)5rP  _BEST_SCORE_MINSURNAME_REQUIREDFIRSTNAME_REQUIREDSURNAME_BOOSTSURNAME_THRESHOLDtempd
downloadedr   r   global_daagsglobal_khatiansr   
local_pathr(  recsr   r   r-  rr   bn_rawr  	bn_joinedrk  rl  daag_qkh_qbn_candidatesrm  matches_foundkh_index
daag_index
file_indexr   dafifarmer_qcandidate_pool	wanted_khwk	wanted_dawdr'   best_overallr  r   rK   r   r&  re  rn  ro  r)   r)   r*   quick_check  sf  


















 	r  ra   )r   )r   )r   )r   r  )Sr8   rj   rs  rq  r:   typingr   r   r   r   	functoolsr   fastapir   r   r	   pydanticr
   r=   r   pdfminer.high_levelr   	pdf2imager   rX   	rapidfuzzr   rapidfuzz.distancer   r   r   rQ   indic_transliterationr   indic_transliteration.sanscriptr   r   r   DOWNLOAD_DIRr9   apprm   r7  Ir3  r:  r5  r<   r+   r   rL   rS   r]   rb   rg   rl   rq   r   r   r   r   intr   r   r   r   r   rO  r   r   r   r   r   r   r   r  r	  r  r'  rF  rG  postr  r)   r)   r)   r*   <module>   s,   


0	&

$#