
    )i                     D    S r SSKrSSKrSSKJr  \R
                  4S jrg)z
Some useful functions for converting and disambiguating between common alternative orthographies (ways of writing) the same text.
    N)	sanscriptc                    U[         R                  :X  Ga  U n[        R                  " SSU5      n[        R                  " SSU5      n[        R                  " SSU5      n[        R                  " SSU5      n[        R                  " SSU5      n[        R                  " SS	U5      n[        R                  " S
SU5      n[        R                  " SSU5      n[        R                  " SSU5      n[        R                  " SSU5      n[        R                  " SSU5      n[        R                  " SSU5      n[        R                  " SSU5      n[        R                  " SSU5      n[        R                  " SSU5      n[        R                  " SSU5      n[        R                  " SSU5      n[        R                  " SS U5      n[        R                  " S!S"U5      n[         R                  " X![         R
                  S#9nU$ [        R                  " S$R                  X5      5        [        R                  " SSU 5      $ )%aG  
Given some (devanAgarI?) sanskrit text, this function produces a "key" so
that

1] The key should be the same for different observed orthographical
forms of the same text. For example:

::

    - "dharmma" vs "dharma"
    - "rAmaM gacChati" vs "rAma~N gacChati" vs "rAma~N gacChati"
    - "kurvan eva" vs "kurvanneva"

2] The key should be different for different for different texts.

-  "stamba" vs "stambha"

This function attempts to succeed at [1] and [2] almostall the time.
Longer the text, probability of failing at [2] decreases, while
probability of failing at [1] increases (albeit very slightly).

Sources of orthographically divergent forms:

-  Phonetically sensible grammar rules
-  Neglect of sandhi while writing
-  Punctuation, spaces, avagraha-s.
-  Regional-language-influenced mistakes (La instead of la.)

Some example applications of this function:

-  Create a database of quotes or words with minimal duplication.
-  Search a database of quotes or words while being robust to optional
   forms.

Also see equivalent function in the scala indic-transliteration package.
z\P{IsDevanagari} z\sz\p{P}u   [०-९।॥॰ऽ]|[॑-॔]u   [यरल]्ँu   म्u   [ङञणन]u   मu   ँ|ंu   ॐu	   ओम्u   [ळऴ]u   लu   ([क-हक़-य़])्\1+z\1u   [कग]्ख्u   ख्u   [कग]्घ्u   घ्u   च्छ्u   छ्u   ज्झ्u   झ्u   त्थ्u   थ्u   द्ध्u   ध्u   ड्ढ्u   ढ्u   प्फ्u   फ्u   ब्भ्u   भ्)_from_tozgot script {} for '{}')	r   
DEVANAGARIregexsubtransliterate	OPTITRANSloggingwarningformat)textencoding_schemekeys      c/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/indic_transliteration/deduplication.pyget_approx_deduplicating_keyr      s   J )...ii+R5iir3'ii"c*ii92sCii+Xs;ii(%5ii	8S1ii{C0ii
E3/ ii3UC@ii+Xs;ii+Xs;ii#6ii#6ii#6ii#6ii#6ii#6ii#6%%ciFYFYZ
077NOyyD))    )__doc__r   r	   indic_transliterationr   r   r    r   r   <module>r      s)      + 8A7K7K F*r   