
    )iO              	       N   S SK r S SKrS SKrS SKrS SKJr  S SKJrJrJ	r	J
r
JrJrJrJrJr  SSKJrJrJrJr  SSKJr  SSKJr  SS	KJr  S
rS
rSrSrSSSSSSSS.r " S S5      r  " S S5      r! " S S5      r"S\S\S\4S jr#\ RH                  " \!RJ                  5      RL                  RO                  5       r(\ RH                  " \"5      RL                  RO                  5       r)S\S\S\ 4S jr*S\S\S\+4S jr,\4S \S!\S\+4S" jjr-\\4S\S#\S$\S\+4S% jjr.S'S\S!\S\4S& jjr/g)(    N)
itemgetter)	AnyDict	GeneratorListMatchOptionalPatternTupleUnion   )T_numT_obj
T_obj_iter
T_obj_list   )cluster_objects)to_list)objects_to_bbox   g      @   ffffifflfiflst)u   ﬀu   ﬃu   ﬄu   ﬁu   ﬂu   ﬆu   ﬅc                       \ rS rSrSrS\\\\\	   4      SS4S jr
   SS\\   S\S	\S
\S\\\4   4
S jjr     SS\\\\   4   S\S\S	\S
\S\S\\\\4      4S jjr SS\S
\S\\\\4      4S jjrSrg)TextMap   z
A TextMap maps each unicode character in the text to an individual `char`
object (or, in the case of layout-implied whitespace, `None`).
tuplesreturnNc                 b    Xl         SR                  [        [        S5      U5      5      U l        g )N r   )r!   joinmapr   	as_stringselfr!   s     U/var/www/html/land-doc-ocr/venv/lib/python3.13/site-packages/pdfplumber/utils/text.py__init__TextMap.__init__#   s"    Z]F!;<    m
main_groupreturn_groupsreturn_charsc                 6   U R                   UR                  U5      UR                  U5       nU VVs/ s H  u  pgUc  M
  UPM     nnn[        U5      u  ppUR	                  U5      U	U
UUS.nU(       a  UR                  5       US'   U(       a  XS'   U$ s  snnf )N)textx0topx1bottomgroupschars)r!   startendr   groupr8   )r)   r.   r/   r0   r1   subsetr3   cr9   r4   r5   r6   r7   results                 r*   match_to_dictTextMap.match_to_dict'   s     QWWZ01553DE$*<FyaF<-e4 GGJ'
  xxzF8#7O# =s   	B Bpatternregexcasec           
        ^ [        U[        5      (       a#  USL a  [        S5      eUSL a  [        S5      eUnOHUSL a  [        R                  " U5      nUSL a  [        R
                  OSn[        R                  " X5      n[        R                  " XpR                  5      n	[        U4S jU	5      n
U
 Vs/ s H  nU R                  UUUTS9PM     sn$ s  snf )NFzACannot pass a compiled search pattern *and* regex=False together.z@Cannot pass a compiled search pattern *and* case=False together.r   c                 T   > [        U R                  T5      R                  5       5      $ N)boolr<   strip)r.   r/   s    r*   <lambda> TextMap.search.<locals>.<lambda>`   s    D)<)B)B)D$Er-   )r0   r1   r/   )
isinstancer
   
ValueErrorreescapeIcompilefinditerr'   filterr@   )r)   rB   rC   rD   r0   r1   r/   compiledflagsgenfilteredr.   s         `     r*   searchTextMap.searchB   s     gw''~ W  u} V  H~))G, EMBDDqEzz'1Hkk(NN3 EsK 
  +)%	   
 	
 
s   6CrI   c                 >    U(       a  SnOSnU R                  USUSS9$ )a6  
`strip` is analogous to Python's `str.strip()` method, and returns
`text` attributes without their surrounding whitespace. Only
relevant when the relevant TextMap is created with `layout` = True

Setting `return_chars` to False will exclude the individual
character objects from the returned text-line dicts.
z *([^\n]+?) *(\n|$)z([^\n]+)r   F)r/   r1   r0   )rX   )r)   rI   r1   pats       r*   extract_text_linesTextMap.extract_text_linesk   s0     (CC{{AL  
 	
r-   )r'   r!   )r   TT)TTTTr   )TT)__name__
__module____qualname____firstlineno____doc__r   r   strr	   r   r+   r   intrH   r   r   r@   r   r
   rX   r\   __static_attributes__ r-   r*   r   r      s#   
=tE#x*>$?@ =T = "!:  	
  
c3h< "!'
sGCL()'
 '
 	'

 '
 '
 '
 
d38n	'
T 8<

04
	d38n	
 
r-   r   c                       \ rS rSrSrS\\\\4      SS4S jr	SSSSS\
\SS\SSS	4S
\S\S\S\S\S\S\S\S\S\S\S\S\S\4S jjrSrg)WordMap   z
A WordMap maps words->chars.
r!   r"   Nc                     Xl         g rG   r!   r(   s     r*   r+   WordMap.__init__   s    r-   Fr   Tlayoutlayout_widthlayout_heightlayout_width_charslayout_height_chars	x_density	y_densityx_shifty_shifty_toleranceuse_text_flow	presortedexpand_ligaturesc           	      v   / n[        U R                  5      (       d  [        U5      $ U(       a  [        O0 nU(       ag  U(       a  U(       a  [	        S5      eO[        [        X&-  5      5      nU(       a  U(       a  [	        S5      eO[        [        X7-  5      5      nS/U-  nO/ nSnU(       d  U(       a  U R                  O[        U R                  S S9nUS   S   nUS   US   -
  n[        [        US	 U
U=(       d    US
95       GHV  u  nnU(       a  US   S   S   UU	-   -
  U-  OSn[        [        US:  5      [        U5      U-
  5      n[        U5       H5  n[        U5      (       a  US   S   S:X  a  UU-  nUR                  S5        M7     UU-  nSnU(       d  U(       a  UO
[        US S9nU H  u  nnU(       a  US   U-
  U-  OSn[        [        SU5      [        U5      U-
  5      nUS/U-  -  nUU-  nU H<  nUR                  US   US   5      n U  H  n!UR                  U!U45        US-  nM     M>     M     U(       d  GMJ  US/UU-
  -  -  nGMY     U(       aD  UUS-   -
  n"[        U"5       H  nUS:  a  UU-  nUR                  S5        M!     US   S:X  a  USS n[        U5      $ )a  
Given a list of (word, chars) tuples (i.e., a WordMap), return a list of
(char-text, char) tuples (i.e., a TextMap) that can be used to mimic the
structural layout of the text on the page(s), using the following approach:

- Sort the words by (doctop, x0) if not already sorted.

- Calculate the initial doctop for the starting page.

- Cluster the words by doctop (taking `y_tolerance` into account), and
  iterate through them.

- For each cluster, calculate the distance between that doctop and the
  initial doctop, in points, minus `y_shift`. Divide that distance by
  `y_density` to calculate the minimum number of newlines that should come
  before this cluster. Append that number of newlines *minus* the number of
  newlines already appended, with a minimum of one.

- Then for each cluster, iterate through each word in it. Divide each
  word's x0, minus `x_shift`, by `x_density` to calculate the minimum
  number of characters that should come before this cluster.  Append that
  number of spaces *minus* the number of characters and spaces already
  appended, with a minimum of one. Then append the word's text.

- At the termination of each line, add more spaces if necessary to
  mimic `layout_width`.

- Finally, add newlines to the end if necessary to mimic to
  `layout_height`.

Note: This approach currently works best for horizontal, left-to-right
text, but will display all words regardless of orientation. There is room
for improvement in better supporting right-to-left text, as well as
vertical text.
z;`layout_width` and `layout_width_chars` cannot both be set.z=`layout_height` and `layout_height_chars` cannot both be set.) Nr   c                 $    [        U S   S   5      $ Nr   doctopfloatxs    r*   rJ   $WordMap.to_textmap.<locals>.<lambda>   s    51h3Hr-   keyr~   r5   c                 $    [        U S   S   5      $ r}   r   r   s    r*   rJ   r      s    %!X/r-   )preserve_order
)r   Nc                 $    [        U S   S   5      $ )Nr   r4   r   r   s    r*   rJ   r     s    eAaDJ.?r-   r4   r   r3   N)lenr!   r   	LIGATURESrM   rd   roundsorted	enumerater   maxrangeappendminget)#r)   rm   rn   ro   rp   rq   rr   rs   rt   ru   rv   rw   rx   ry   _textmap
expansions
blank_linenum_newlineswords_sorted_doctop
first_worddoctop_startiwsy_distnum_newlines_prependline_lenline_words_sorted_x0wordr9   x_distnum_spaces_prependr>   lettersletternum_newlines_appends#                                      r*   
to_textmapWordMap.to_textmap   s   f 794;;8$$"2Y
!$U   
 &)|/G)H%I"" $W  !
 '*%0I*J&K#%);;JJ M KK)HI 	 )+A.
!(+j.??#/(9M	
EAr  Aq(#|g'=>)K 
 $'AE
f,	$  /08}}Q4(?
*H- 1
 00LH  B$?@ !  4e?E$t*w.);1%(Q)95=8;S%T"[M,>>>..A(nnQvY&	BG") 4 A #*   4 v[M-?(-JKKc
h "59I"J./q5
*H- 0 ||+#CR=x  r-   rk   )r^   r_   r`   ra   rb   r   r   r   r   r+   DEFAULT_X_DENSITYDEFAULT_Y_DENSITYDEFAULT_Y_TOLERANCErH   r   rd   r   r   re   rf   r-   r*   rh   rh      s    tE%*;$<= $ 
  "##$,,0#!%Y!Y! Y! 	Y!
  Y! !Y! Y! Y! Y! Y! Y! Y! Y! Y! 
Y! Y!r-   rh   c                      \ rS rSr\\SSSSSSS4	S\S\S\S\S	\S
\S\\	\
      S\\\
4   S\4S jjrS\S\4S jrS\S\S\4S jrS\S\\SS4   4S jrS\S\\SS4   4S jrS\S\\\\4   SS4   4S jrS\S\4S jrS\S\4S jrSrg)WordExtractori$  FTNx_tolerancerv   keep_blank_charsrw   horizontal_ltrvertical_ttbextra_attrssplit_at_punctuationry   c
                     Xl         X l        X0l        X@l        XPl        X`l        Uc  / OUU l        USL a  [        R                  O
U=(       d    SU l	        U	(       a  [        U l        g 0 U l        g )NTr$   )r   rv   r   rw   r   r   r   stringpunctuationr   r   r   )
r)   r   rv   r   rw   r   r   r   r   ry   s
             r*   r+   WordExtractor.__init__%  sm     '& 0*,(!,!42+
 $t+ &," 	! (8)Rr-   ordered_charsr"   c           	      8  ^  [        U5      u  p#pEUS   S   US   S   -
  nUS   S   nU(       a  T R                  (       a  OT R                  (       a  SOSnSR                  U 4S jU 5       5      UUUX6-   UUUS	.n	T R                   H  n
US   U
   X'   M     U	$ )
Nr   r~   r5   uprightr   r   r$   c              3   h   >#    U  H'  nTR                   R                  US    US    5      v   M)     g7fr3   N)r   r   ).0r>   r)   s     r*   	<genexpr>,WordExtractor.merge_chars.<locals>.<genexpr>J  s0      CPa##AfIqy99=s   /2)r3   r4   r6   r5   r~   r7   r   	direction)r   r   r   r%   r   )r)   r   r4   r5   r6   r7   
doctop_adjr   r   r   r   s   `          r*   merge_charsWordExtractor.merge_charsB  s    -m<"1%h/-2B52II
"9-18$---d>O>OAVX	 GG CP  &"
 ##C%a(-DI $ r-   	prev_char	curr_charc                    US   (       aV  U R                   nU R                  nUS   nUS   nU R                  (       a  US   nUS   nUS   n	OhUS   * nUS   * nUS   * n	OUU R                  nU R                   nUS   nUS   nU R                  (       a  US   nUS   nUS   n	OUS   * nUS   * nUS   * n	[	        X:  =(       d    XU-   :  =(       d    XeU-   :  5      $ )a  This method takes several factors into account to determine if
`curr_char` represents the beginning of a new word:

- Whether the text is "upright" (i.e., non-rotated)
- Whether the user has specified that horizontal text runs
  left-to-right (default) or right-to-left, as represented by
  self.horizontal_ltr
- Whether the user has specified that vertical text the text runs
  top-to-bottom (default) or bottom-to-top, as represented by
  self.vertical_ttb
- The x0, top, x1, and bottom attributes of prev_char and
  curr_char
- The self.x_tolerance and self.y_tolerance settings. Note: In
  this case, x/y refer to those directions for non-rotated text.
  For vertical text, they are flipped. A more accurate terminology
  might be "*intra*line character distance tolerance" and
  "*inter*line character distance tolerance"

An important note: The *intra*line distance is measured from the
*end* of the previous character to the *beginning* of the current
character, while the *inter*line distance is measured from the
*top* of the previous character to the *top* of the next
character. The reasons for this are partly repository-historical,
and partly logical, as successive text lines' bounding boxes often
overlap slightly (and we don't want that overlap to be interpreted
as the two lines being the same line).

The upright-ness of the character determines the attributes to
compare, while horizontal_ltr/vertical_ttb determine the direction
of the comparison.
r   r5   r4   r6   r7   )r   rv   r   r   rH   )
r)   r   r   r   yaycyaxbxcxs
             r*   char_begins_new_word"WordExtractor.char_begins_new_word[  s'   N Y  A  A5!B5!B""t_t_t_o%o%o%   A  A4B4B  u%x(u%))&&))W !V !V
 	
r-   c              #     ^#    / mS[         [           S[        [        S S 4   4U4S jjnU H  nUS   nU R                  (       d'  UR                  5       (       a  U" S 5       S h  vN   M@  X@R                  ;   a"  U" U5       S h  vN   U" S 5       S h  vN   Mq  T(       a,  U R                  TS   U5      (       a  U" U5       S h  vN   M  TR                  U5        M     T(       a  Tv   g g  N Nm N_ N.7f)Nnew_charr"   c              3   6   >#    T(       a  Tv   U c  / mg U /mg 7frG   rf   )r   current_words    r*   start_next_word:WordExtractor.iter_chars_to_words.<locals>.start_next_word  s!     
 ""!)!12LzLs   r3   r   )	r	   r   r   r   r   isspacer   r   r   )r)   r   r   charr3   r   s        @r*   iter_chars_to_words!WordExtractor.iter_chars_to_words  s      $&	Buo	Bz4-.	B "D<D((T\\^^*4000222*4000*4000$";";L<Ld"S"S*4000 ##D) ""   1 10 1sH   A#C;&C3' C;C5C;C72C;
C9)C;5C;7C;9C;r9   c              #     #    S[         S[        4S jn[        [        U5      US5       H  nUS   S   nU(       a  SOSn[        U[	        U5      U R
                  5      nU Hj  nU(       a  SOSn[        U[	        U5      S9n	U(       a  U R                  (       d'  OU R                  (       d  [        U	5       S h  vN   M`  U	 S h  vN   Ml     M     g  N N7f)	Nr   r"   c                      [        U S   5      * $ Nr   )rd   r   s    r*   upright_key2WordExtractor.iter_sort_chars.<locals>.upright_key  s    )%%%r-   r   r   r~   r4   r   )
r   rd   r   listr   rv   r   r   r   reversed)
r)   r9   r   upright_clusterr   cluster_keysubclustersscsort_keyto_yields
             r*   iter_sort_charsWordExtractor.iter_sort_chars  s     	&5 	&S 	&  /tE{KKO%a(3G&-(4K *K!8$:J:JK "#*4!"*X*>? 07+++D<M<M'111''' "  L  2's$   B9C;C<CCCCc              #     #    U R                   (       a  UOU R                  U5      n[        S/U R                  Q76 n[        R
                  " X#5      nU H2  u  pVU R                  U5       H  nU R                  U5      U4v   M     M4     g 7fr   )rw   r   r   r   	itertoolsgroupbyr   r   )r)   r9   r   grouping_keygrouped_charskeyvals
char_group
word_charss           r*   iter_extract_tuples!WordExtractor.iter_extract_tuples  s      "&!3!39M9Me9T!)?d.>.>?!))-F#0G"66zB
''
3Z@@ C $1s   B
Bc                 H    [        [        U R                  U5      5      5      $ rG   )rh   r   r   r)   r9   s     r*   extract_wordmapWordExtractor.extract_wordmap  s    tD44U;<==r-   c                 D    [        S U R                  U5       5       5      $ )Nc              3   *   #    U  H	  u  pUv   M     g 7frG   rf   )r   r   r   s      r*   r   .WordExtractor.extract_words.<locals>.<genexpr>  s     Q1P-TD1P   )r   r   r   s     r*   extract_wordsWordExtractor.extract_words  s    Q1I1I%1PQQQr-   )	r   r   r   r   r   rw   r   r   rv   )r^   r_   r`   ra   DEFAULT_X_TOLERANCEr   r   rH   r	   r   rc   r   r+   r   r   r   r   r   r   r   r   r   r   rh   r   r   re   rf   r-   r*   r   r   $  si    10!&##!+/16!%@@ @ 	@
 @ @ @ d3i(@ $D#I.@ @:  2I
I
 I
 
	I
V!'!	:tT)	*!F(Z (IeT4>O4P (0
A
A	5
*+T47	8
A>Z >G >R: R* Rr-   r   r9   kwargsr"   c                 6    [        S0 UD6R                  U 5      $ )Nrf   )r   r   )r9   r   s     r*   r   r     s    "6"0077r-   c           
         UR                  SS05        [        S0 [         Vs0 s H  o"U;   d  M
  X!U   _M     snD6nUR                  U 5      nUR                  " S0 [
         Vs0 s H  o"U;   d  M
  X!U   _M     snD6nU$ s  snf s  snf )Nrx   Trf   )updater   WORD_EXTRACTOR_KWARGSr   r   TEXTMAP_KWARGS)r9   r   k	extractorwordmaptextmaps         r*   chars_to_textmapr    s    
MM;%& !6
F!6Av+<1Qi<!6
FI ''.G   !/
?A;<1Qi<
?G N G @s   	B	B%	B2	Bc           
         [        U 5      n [        U 5      S:X  a  gUR                  S5      (       a  [        U 40 UD6R                  $ UR                  S[
        5      n[        S0 [         Vs0 s H  o3U;   d  M
  X1U   _M     snD6nUR                  U 5      n[        U[        S5      U5      nSR                  S U 5       5      $ s  snf )	Nr   r$   rm   rv   r~   r   c              3   R   #    U  H  nS R                  S U 5       5      v   M     g7f)r{   c              3   *   #    U  H	  oS    v   M     g7fr   rf   )r   r   s     r*   r   )extract_text.<locals>.<genexpr>.<genexpr>  s     !@44v,4r   N)r%   )r   lines     r*   r   extract_text.<locals>.<genexpr>  s"     SUT!@4!@@@Us   %'rf   )r   r   r   r  r'   r   r   r  r   r   r   r%   )r9   r   rv   r  r  wordsliness          r*   extract_textr  
  s     ENE
5zQzz(00:::jj0CD! 
%:J%:6k|q)|%:J
	 ''.z(';[IyySUSSS	 Ks   -	C:	C
line_chars	tolerancec                     SnS n[        U [        S5      S9 H"  nUb  US   X1-   :  a  US-  nUS   nX$S   -  nM$     U$ )Nr$   r4   r   r{   r6   r3   )r   r   )r  r  colllast_x1r   s        r*   collate_liner    s]     DGzz$'78d4jG4G&HCKDt*V	 9
 Kr-   r   rv   c                 f   ^ [        U [        S5      U5      nSR                  U4S jU 5       5      $ )Nr~   r   c              3   <   >#    U  H  n[        UT5      v   M     g 7frG   )r  )r   r>   r   s     r*   r   &extract_text_simple.<locals>.<genexpr>2  s     E9a\![119s   )r   r   r%   )r9   r   rv   	clustereds    `  r*   extract_text_simpler  ,  s-    
  z(';[II99E9EEEr-   c                    ^^^ [        SSSS5      m[        SS5      mS[        S[        [        S	S	4   4UUU4S
 jjnU" U 5      n[	        X0R
                  S9$ )u   
Removes duplicate chars — those sharing the same text, fontname, size,
and positioning (within `tolerance`) as other characters in the set.
fontnamesizer   r3   r~   r4   r9   r"   Nc              3     >#    [        U TS9n[        R                  " UTS9 HW  u  p#[        [	        U5      [        S5      T5       H/  n[        U[        S5      T5       H  n[        UTS9S   v   M     M1     MY     g 7f)Nr   r~   r4   r   )r   r   r   r   r   r   )	r9   sorted_charsgrp	grp_chars	y_cluster	x_clusterr   pos_keyr  s	         r*   yield_unique_chars(dedupe_chars.<locals>.yield_unique_chars=  s|     e-'//#FNC,YH!5y	 "1z$/"I !8;;" Gs   A<A?r   )r   r   r   r   r   index)r9   r  r'  dedupedr   r&  s    `  @@r*   dedupe_charsr+  5  s`    
 ZF
;C4(G	<* 	<5$;L1M 	< 	< !'G'{{++r-   )r   )0inspectr   rN   r   operatorr   typingr   r   r   r   r   r	   r
   r   r   _typingr   r   r   r   
clusteringr   genericr   geometryr   r   r   r   r   r   r   rh   r   r   	signaturer   
parameterskeysr  r  r  rc   r  r  r  r+  rf   r-   r*   <module>r6     s     	   U U U : : '  %     	`
 `
Fa! a!HMR MR`8 8s 8z 8 ""7#5#56AAFFH))-8CCHHJ J # ' TTT 	T, + 	  -,FFF F 		F,
 ,u ,Z ,r-   