
from __future__ import annotations

from rapidfuzz._common_py import conv_sequences
from rapidfuzz._utils import is_none, setupPandas
from rapidfuzz.distance.LCSseq_py import (
    _block_similarity as lcs_seq_block_similarity,
    editops as lcs_seq_editops,
    opcodes as lcs_seq_opcodes,
    similarity as lcs_seq_similarity,
)


def distance(s1, s2, *, processor=None, score_cutoff=None):
    """
Calculates the minimum number of insertions and deletions
required to change one sequence into the other. This is equivalent to the
Levenshtein distance with a substitution weight of 2.

Parameters
----------
s1 : Sequence[Hashable]
    First string to compare.
s2 : Sequence[Hashable]
    Second string to compare.
processor: callable, optional
    Optional callable that is used to preprocess the strings before
    comparing them. Default is None, which deactivates this behaviour.
score_cutoff : int, optional
    Maximum distance between s1 and s2 that is still considered a result.
    If the distance is bigger than score_cutoff, score_cutoff + 1 is returned
    instead. Default is None, which deactivates this behaviour.

Returns
-------
distance : int
    distance between s1 and s2

Examples
--------
Find the Indel distance between two strings:

>>> from rapidfuzz.distance import Indel
>>> Indel.distance("lewenstein", "levenshtein")
3

Setting a maximum distance allows the implementation to select
a more efficient algorithm:

>>> Indel.distance("lewenstein", "levenshtein", score_cutoff=1)
2

    """
    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    s1, s2 = conv_sequences(s1, s2)
    # Indel distance = (len1 + len2) - 2 * (length of the longest common subsequence)
    maximum = len(s1) + len(s2)
    lcs_sim = lcs_seq_similarity(s1, s2)
    dist = maximum - 2 * lcs_sim
    return dist if (score_cutoff is None or dist <= score_cutoff) else score_cutoff + 1


def _block_distance(block, s1, s2, score_cutoff=None):
    # internal variant of distance() that reuses a precomputed block for s1
    maximum = len(s1) + len(s2)
    lcs_sim = lcs_seq_block_similarity(block, s1, s2)
    dist = maximum - 2 * lcs_sim
    return dist if (score_cutoff is None or dist <= score_cutoff) else score_cutoff + 1


def similarity(s1, s2, *, processor=None, score_cutoff=None):
    """
Calculates the Indel similarity in the range [max, 0].

This is calculated as ``(len1 + len2) - distance``.

Parameters
----------
s1 : Sequence[Hashable]
    First string to compare.
s2 : Sequence[Hashable]
    Second string to compare.
processor: callable, optional
    Optional callable that is used to preprocess the strings before
    comparing them. Default is None, which deactivates this behaviour.
score_cutoff : int, optional
    Minimum similarity between s1 and s2 that is still considered a result.
    If the similarity is smaller than score_cutoff, 0 is returned instead.
    Default is None, which deactivates this behaviour.

Returns
-------
similarity : int
    similarity between s1 and s2
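
Examples
--------
The expected value below follows from the documented formula
``(len1 + len2) - distance``: the Indel distance between the two strings
is 3 (see ``distance``) and their combined length is 21, so the
similarity is 18.

>>> from rapidfuzz.distance import Indel
>>> Indel.similarity("lewenstein", "levenshtein")
18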
    """
    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    s1, s2 = conv_sequences(s1, s2)
    # similarity = (len1 + len2) - distance
    maximum = len(s1) + len(s2)
    dist = distance(s1, s2)
    sim = maximum - dist
    return sim if (score_cutoff is None or sim >= score_cutoff) else 0


def normalized_distance(s1, s2, *, processor=None, score_cutoff=None):
    """
Calculates a normalized indel distance in the range [1, 0].

This is calculated as ``distance / (len1 + len2)``.

Parameters
----------
s1 : Sequence[Hashable]
    First string to compare.
s2 : Sequence[Hashable]
    Second string to compare.
processor: callable, optional
    Optional callable that is used to preprocess the strings before
    comparing them. Default is None, which deactivates this behaviour.
score_cutoff : float, optional
    Optional argument for a score threshold as a float between 0 and 1.0.
    For norm_dist > score_cutoff 1.0 is returned instead. Default is 1.0,
    which deactivates this behaviour.

Returns
-------
norm_dist : float
    normalized distance between s1 and s2 as a float between 0 and 1.0
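
Examples
--------
The expected value below follows from the documented formula
``distance / (len1 + len2)``: with an Indel distance of 3 and a combined
length of 21 for the two strings, the normalized distance is 3 / 21.

>>> from rapidfuzz.distance import Indel
>>> Indel.normalized_distance("lewenstein", "levenshtein")
0.14285714285714285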
    """
    setupPandas()
    if is_none(s1) or is_none(s2):
        return 1.0

    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    s1, s2 = conv_sequences(s1, s2)
    # normalized distance = distance / (len1 + len2); two empty inputs compare as equal
    maximum = len(s1) + len(s2)
    dist = distance(s1, s2)
    norm_dist = dist / maximum if maximum else 0
    return norm_dist if (score_cutoff is None or norm_dist <= score_cutoff) else 1.0


def _block_normalized_distance(block, s1, s2, score_cutoff=None):
    # internal variant of normalized_distance() that reuses a precomputed block for s1
    maximum = len(s1) + len(s2)
    dist = _block_distance(block, s1, s2)
    norm_dist = dist / maximum if maximum else 0
    return norm_dist if (score_cutoff is None or norm_dist <= score_cutoff) else 1.0


def normalized_similarity(s1, s2, *, processor=None, score_cutoff=None):
    """
Calculates a normalized indel similarity in the range [0, 1].

This is calculated as ``1 - normalized_distance``.

Parameters
----------
s1 : Sequence[Hashable]
    First string to compare.
s2 : Sequence[Hashable]
    Second string to compare.
processor: callable, optional
    Optional callable that is used to preprocess the strings before
    comparing them. Default is None, which deactivates this behaviour.
score_cutoff : float, optional
    Optional argument for a score threshold as a float between 0 and 1.0.
    For norm_sim < score_cutoff 0 is returned instead. Default is 0,
    which deactivates this behaviour.

Returns
-------
norm_sim : float
    normalized similarity between s1 and s2 as a float between 0 and 1.0

Examples
--------
Find the normalized Indel similarity between two strings:

>>> from rapidfuzz.distance import Indel
>>> Indel.normalized_similarity("lewenstein", "levenshtein")
0.8571428571428572

Setting a score_cutoff allows the implementation to select
a more efficient algorithm:

>>> Indel.normalized_similarity("lewenstein", "levenshtein", score_cutoff=0.9)
0.0

When a different processor is used, s1 and s2 do not have to be strings:

>>> Indel.normalized_similarity(["lewenstein"], ["levenshtein"], processor=lambda s: s[0])
0.8571428571428572
    """
    setupPandas()
    if is_none(s1) or is_none(s2):
        return 0.0

    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    s1, s2 = conv_sequences(s1, s2)
    # normalized similarity = 1 - normalized_distance
    norm_dist = normalized_distance(s1, s2)
    norm_sim = 1.0 - norm_dist
    return norm_sim if (score_cutoff is None or norm_sim >= score_cutoff) else 0.0


def _block_normalized_similarity(block, s1, s2, score_cutoff=None):
    # internal variant of normalized_similarity() that reuses a precomputed block for s1
    norm_dist = _block_normalized_distance(block, s1, s2)
    norm_sim = 1.0 - norm_dist
    return norm_sim if (score_cutoff is None or norm_sim >= score_cutoff) else 0.0


def editops(s1, s2, *, processor=None):
    """
Return Editops describing how to turn s1 into s2.

Parameters
----------
s1 : Sequence[Hashable]
    First string to compare.
s2 : Sequence[Hashable]
    Second string to compare.
processor: callable, optional
    Optional callable that is used to preprocess the strings before
    comparing them. Default is None, which deactivates this behaviour.

Returns
-------
editops : Editops
    edit operations required to turn s1 into s2

Notes
-----
The alignment is calculated using an algorithm of Heikki Hyyrö, which is
described in [6]_. It has a time complexity and memory usage of ``O([N/64] * M)``.

References
----------
.. [6] Hyyrö, Heikki. "A Note on Bit-Parallel Alignment Computation."
       Stringology (2004).

Examples
--------
>>> from rapidfuzz.distance import Indel
>>> for tag, src_pos, dest_pos in Indel.editops("qabxcd", "abycdf"):
...    print(("%7s s1[%d] s2[%d]" % (tag, src_pos, dest_pos)))
 delete s1[0] s2[0]
 delete s1[3] s2[2]
 insert s1[4] s2[2]
 insert s1[6] s2[5]
    """
    return lcs_seq_editops(s1, s2, processor=processor)


def opcodes(s1, s2, *, processor=None):
    """
Return Opcodes describing how to turn s1 into s2.

Parameters
----------
s1 : Sequence[Hashable]
    First string to compare.
s2 : Sequence[Hashable]
    Second string to compare.
processor: callable, optional
    Optional callable that is used to preprocess the strings before
    comparing them. Default is None, which deactivates this behaviour.

Returns
-------
opcodes : Opcodes
    edit operations required to turn s1 into s2

Notes
-----
The alignment is calculated using an algorithm of Heikki Hyyrö, which is
described in [7]_. It has a time complexity and memory usage of ``O([N/64] * M)``.

References
----------
.. [7] Hyyrö, Heikki. "A Note on Bit-Parallel Alignment Computation."
       Stringology (2004).

Examples
--------
>>> from rapidfuzz.distance import Indel

>>> a = "qabxcd"
>>> b = "abycdf"
>>> for tag, i1, i2, j1, j2 in Indel.opcodes(a, b):
...    print(("%7s a[%d:%d] (%s) b[%d:%d] (%s)" %
...           (tag, i1, i2, a[i1:i2], j1, j2, b[j1:j2])))
 delete a[0:1] (q) b[0:0] ()
  equal a[1:3] (ab) b[0:2] (ab)
 delete a[3:4] (x) b[2:2] ()
 insert a[4:4] () b[2:3] (y)
  equal a[4:6] (cd) b[3:5] (cd)
 insert a[6:6] () b[5:6] (f)
    """
    return lcs_seq_opcodes(s1, s2, processor=processor)