Fixed ° and the overload issue

This commit is contained in:
root 2025-04-09 16:49:59 +00:00
parent d51d597e87
commit 322d662011
118 changed files with 36900 additions and 277 deletions

View File

@ -3,3 +3,4 @@ pdfs/
node_modules/
pycache/
cloned_repo/
extracted_texts/

View File

@ -2,7 +2,7 @@ Control Panel,Unit Number,Alias,Equipment Type,Type of Conveyor,Speed,Drive Hand
BULK INBOUND NORTH,BS1-005,PS10-1,Powered-Belted,Level Belt,150,RH,460/3/60,25,5,Y,N,RPH3200BXB-FR,NA,24,12,N,Y,NA,NA,"36"" SG, Bi-directional"
BULK INBOUND NORTH,BS1-010,PS10-2,Powered-Belted,Incline Belt,200,LH,460/3/60,25,15,Y,N,APH150MFOXLN,NA,24,,N,Y,NA,NA,"36"" SG"
BULK INBOUND NORTH,BS1-011,PS10-3,Powered-Belted,Incline Belt,240,RH,460/3/60,25,15,Y,N,APH150MFOXLN,NA,24.00'',12'',,,,,
BULK INBOUND NORTH,BS1-015-CH,PS10-4CH,90<EFBFBD> Spiral Chute,Chute,NA,NA,NA,25,NA,NA,NA,NA,NA,24.00'',24.00'',NA,NA,NA,NA,
BULK INBOUND NORTH,BS1-015-CH,PS10-4CH,90° Spiral Chute,Chute,NA,NA,NA,25,NA,NA,NA,NA,NA,24.00'',24.00'',NA,NA,NA,NA,
BULK INBOUND NORTH,BS1-020,PS10-5,Powered-Belted,Level Belt,240,LH,460/3/60,25,15,Y,N,RPH3200BXB-FR,NA,24.00'',12'',N,Y,Y,QTY 4,
BULK INBOUND NORTH,BS1-020-CH1,PS10-5CH1,Induct Chute,Chute,NA,NA,NA,25,NA,NA,NA,NA,NA,24.00'',24.00'',NA,NA,NA,NA,
BULK INBOUND NORTH,BS1-020-CH2,PS10-5CH2,Induct Chute,Chute,NA,NA,NA,25,NA,NA,NA,NA,NA,24.00'',24.00'',NA,NA,NA,NA,
@ -12,17 +12,17 @@ BULK INBOUND NORTH,BS1-020-DIV1,PS10-5DIV1,Sorter,Divert Arm,NA,NA,NA,NA,NA,NA,N
BULK INBOUND NORTH,BS1-020-DIV2,PS10-5DIV2,Sorter,Divert Arm,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,
BULK INBOUND NORTH,BS1-020-DIV3,PS10-5DIV3,Sorter,Divert Arm,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,
BULK INBOUND NORTH,BS1-020-DIV4,PS10-5DIV5,Sorter,Divert Arm,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,
BULK INBOUND NORTH,BS1-025-CH,PS10-6CH,90<EFBFBD> Straight Chute,Chute,NA,NA,NA,25,NA,NA,NA,NA,NA,24.00'',24.00'',NA,NA,NA,NA,
BULK INBOUND NORTH,BS1-025-CH,PS10-6CH,90° Straight Chute,Chute,NA,NA,NA,25,NA,NA,NA,NA,NA,24.00'',24.00'',NA,NA,NA,NA,
BULK INBOUND NORTH,BS2-005,PS11-1,Powered-Belted,Level Belt,150,RH,460/3/60,25,5,Y,N,RPH3200BXB-FR,NA,24.00'',12'',N,Y,NA,NA,"36"" SG"
BULK INBOUND NORTH,BS2-010,PS11-2,Powered-Belted,Incline Belt,200,RH,460/3/60,25,5,Y,N,APH150MFOXLN,NA,24.00'',12'',Y,Y,NA,NA,"36"" SG"
BULK INBOUND NORTH,BS2-015,PS11-3,Powered-Belted,Level Belt,240,RH,460/3/60,25,10,Y,N,RPH3200BXB-FR,NA,24.00'',12'',Y,Y,NA,NA,
BULK INBOUND NORTH,BS2-020,PS11-4,Powered-Belted,Incline Belt,240,LH,460/3/60,25,10,Y,N,APH150MFOXLN,NA,24.00'',-,N,Y,NA,NA,
BULK INBOUND NORTH,BS2-025-CH,PS11-5CH,90<EFBFBD> Spiral Chute,Chute,NA,NA,NA,25,NA,NA,NA,NA,NA,24.00'',24.00'',NA,NA,NA,NA,
BULK INBOUND NORTH,BS2-025-CH,PS11-5CH,90° Spiral Chute,Chute,NA,NA,NA,25,NA,NA,NA,NA,NA,24.00'',24.00'',NA,NA,NA,NA,
BULK INBOUND NORTH,BS2-030,PS11-6,Powered-Belted,Incline Belt,240,RH,460/3/60,25,10,Y,N,APH150MFOXLN,NA,24.00'',-,N,Y,NA,NA,"36"" SG"
BULK INBOUND NORTH,BS2-035,PS11-7,Powered-Belted,Incline Belt,240,RH,460/3/60,25,20,Y,N,APH150MFOXLN,NA,24.00'',-,N,Y,NA,NA,"36"" SG"
BULK INBOUND NORTH,BS2-040,PS11-8,Powered-Belted,Incline Belt,240,RH,460/3/60,25,15,Y,N,APH150MFOXLN,NA,24.00'',-,N,Y,NA,NA,"36"" SG"
BULK INBOUND NORTH,BS2-045,PS11-9,Powered-Belted,Incline Belt,240,RH,460/3/60,25,15,Y,N,APH150MFOXLN,NA,24.00'',12'',N,Y,NA,NA,"36"" SG"
BULK INBOUND NORTH,BS2-050-CH,PS11-10CH,90<EFBFBD> Spiral Chute,Chute,NA,NA,NA,25,NA,NA,NA,NA,NA,24.00'',24.00'',NA,NA,NA,NA,
BULK INBOUND NORTH,BS2-050-CH,PS11-10CH,90° Spiral Chute,Chute,NA,NA,NA,25,NA,NA,NA,NA,NA,24.00'',24.00'',NA,NA,NA,NA,
BULK INBOUND NORTH,BS2-055,PS11-11,Powered-Belted,Level Belt,240,LH,460/3/60,25,15,Y,N,RPH3200BXB-FR,NA,24.00'',12'',N,Y,Y,QTY 6,
BULK INBOUND NORTH,BS2-055-CH1,PS11-11CH1,Induct Chute,Chute,NA,NA,NA,25,NA,NA,NA,NA,NA,24.00'',24.00'',NA,NA,NA,NA,
BULK INBOUND NORTH,BS2-055-CH2,PS11-11CH2,Induct Chute,Chute,NA,NA,NA,25,NA,NA,NA,NA,NA,24.00'',24.00'',NA,NA,NA,NA,
@ -36,17 +36,17 @@ BULK INBOUND NORTH,BS2-055-DIV3,PS11-11DIV3,Sorter,Divert Arm,NA,NA,NA,NA,NA,NA,
BULK INBOUND NORTH,BS2-055-DIV4,PS11-11DIV4,Sorter,Divert Arm,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,
BULK INBOUND NORTH,BS2-055-DIV5,PS11-11DIV5,Sorter,Divert Arm,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,
BULK INBOUND NORTH,BS2-055-DIV6,PS11-11DIV6,Sorter,Divert Arm,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,
BULK INBOUND NORTH,BS2-060-CH,PS1-12CH,90<EFBFBD> Straight Chute,Chute,NA,NA,NA,25,NA,NA,NA,NA,NA,24.00'',24.00'',NA,NA,NA,NA,
BULK INBOUND NORTH,BS2-060-CH,PS1-12CH,90° Straight Chute,Chute,NA,NA,NA,25,NA,NA,NA,NA,NA,24.00'',24.00'',NA,NA,NA,NA,
BULK INBOUND SOUTH,BS3-005,PS8-1,Powered-Belted,Level Belt,150,LH,460/3/60,25,7.5,Y,N,RPH3200BXB-FR,NA,24.00'',12'',N,Y,NA,NA,
BULK INBOUND SOUTH,BS3-010,PS8-2,Powered-Belted,Incline Belt,200,LH,460/3/60,25,10,Y,N,RPH3200BXB-FR,NA,24.00'',12'',Y,Y,NA,NA,
BULK INBOUND SOUTH,BS3-015-CH,PS8-3CH,90<EFBFBD> Spiral Chute,Chute,NA,NA,NA,25,NA,NA,NA,NA,NA,24.00'',24.00'',NA,NA,NA,NA,
BULK INBOUND SOUTH,BS3-015-CH,PS8-3CH,90° Spiral Chute,Chute,NA,NA,NA,25,NA,NA,NA,NA,NA,24.00'',24.00'',NA,NA,NA,NA,
BULK INBOUND SOUTH,BS3-020,PS8-4,Powered-Belted,Incline Belt,240,LH,460/3/60,25,15,Y,N,APH150MFOXLN,NA,24.00'',-,N,Y,NA,NA,"36"" SG"
BULK INBOUND SOUTH,BS3-025,PS8-5,Powered-Belted,Incline Belt,240,LH,460/3/60,25,15,Y,N,APH150MFOXLN,NA,24.00'',12'',N,Y,NA,NA,"36"" SG"
BULK INBOUND SOUTH,BS3-030,PS8-6,Sorter,Intralox Flowsplitter,240,LH,460/3/60,25,5,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,
BULK INBOUND SOUTH,BS3-035-CH,PS8-7CH,Straight Chute,Chute,NA,NA,NA,25,NA,NA,NA,NA,NA,24.00'',24.00'',NA,NA,NA,NA,
BULK INBOUND SOUTH,BS3-040,PS8-8,Powered-Belted,Incline Belt,240,RH,460/3/60,25,7.5,Y,N,APH150MFOXLN,NA,24.00'',-,N,Y,NA,NA,
BULK INBOUND SOUTH,BS3-045,PS8-9,Powered-Belted,Incline Belt,240,LH,460/3/60,25,15,Y,N,APH150MFOXLN,NA,24.00'',-,N,Y,NA,NA,
BULK INBOUND SOUTH,BS3-050-CH,PS8-10CH,90<EFBFBD> Spiral Chute,Chute,NA,NA,NA,25,NA,NA,NA,NA,NA,24.00'',24.00'',NA,NA,NA,NA,
BULK INBOUND SOUTH,BS3-050-CH,PS8-10CH,90° Spiral Chute,Chute,NA,NA,NA,25,NA,NA,NA,NA,NA,24.00'',24.00'',NA,NA,NA,NA,
BULK INBOUND SOUTH,BS3-055,PS8-11,Powered-Belted,Level Belt,240,RH,460/3/60,25,10,Y,N,RPH3200BXB-FR,NA,24.00'',12'',N,Y,Y,QTY 4,
BULK INBOUND SOUTH,BS3-055-CH1,PS8-11CH1,Induct Chute,Chute,NA,NA,NA,25,NA,NA,NA,NA,NA,24.00'',24.00'',NA,NA,NA,NA,
BULK INBOUND SOUTH,BS3-055-CH2,PS8-11CH2,Induct Chute,Chute,NA,NA,NA,25,NA,NA,NA,NA,NA,24.00'',24.00'',NA,NA,NA,NA,
@ -56,7 +56,7 @@ BULK INBOUND SOUTH,BS3-055-DIV1,PS8-11DIV1,Sorter,Divert Arm,NA,NA,NA,NA,NA,NA,N
BULK INBOUND SOUTH,BS3-055-DIV2,PS8-11DIV2,Sorter,Divert Arm,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,
BULK INBOUND SOUTH,BS3-055-DIV3,PS8-11DIV3,Sorter,Divert Arm,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,
BULK INBOUND SOUTH,BS3-055-DIV4,PS8-11DIV4,Sorter,Divert Arm,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,
BULK INBOUND SOUTH,BS3-060-CH,PS8-12CH,90<EFBFBD> Straight Chute,Chute,NA,NA,NA,25,NA,NA,NA,NA,NA,24.00'',24.00'',NA,NA,NA,NA,
BULK INBOUND SOUTH,BS3-060-CH,PS8-12CH,90° Straight Chute,Chute,NA,NA,NA,25,NA,NA,NA,NA,NA,24.00'',24.00'',NA,NA,NA,NA,
BULK INBOUND SOUTH,BS4-005-CH,PS9-1CH,Straight Chute,Chute,NA,NA,NA,25,NA,NA,NA,NA,NA,24.00'',24.00'',NA,NA,NA,NA,
BULK INBOUND SOUTH,BS4-010,PS9-2,Powered-Belted,Decline Belt,240,RH,460/3/60,25,7.5,Y,N,APH150MFOXLN,NA,24.00'',-,N,Y,NA,NA,
BULK INBOUND SOUTH,BS4-015,PS9-3,Powered-Belted,Level Belt,240,RH,460/3/60,25,7.5,Y,N,RPH3200BXB-FR,NA,24.00'',12'',N,Y,Y,QTY 4,

1 Control Panel Unit Number Alias Equipment Type Type of Conveyor Speed Drive Hand PSU Live Load Motor HP VFD Brake Belting Type Roll Center Side Guard 1 Side Guard 2 Netting Btm Guard Diverts Diverts 2 Remarks
2 BULK INBOUND NORTH BS1-005 PS10-1 Powered-Belted Level Belt 150 RH 460/3/60 25 5 Y N RPH3200BXB-FR NA 24 12 N Y NA NA 36" SG, Bi-directional
3 BULK INBOUND NORTH BS1-010 PS10-2 Powered-Belted Incline Belt 200 LH 460/3/60 25 15 Y N APH150MFOXLN NA 24 N Y NA NA 36" SG
4 BULK INBOUND NORTH BS1-011 PS10-3 Powered-Belted Incline Belt 240 RH 460/3/60 25 15 Y N APH150MFOXLN NA 24.00'' 12''
5 BULK INBOUND NORTH BS1-015-CH PS10-4CH 90� Spiral Chute 90° Spiral Chute Chute NA NA NA 25 NA NA NA NA NA 24.00'' 24.00'' NA NA NA NA
6 BULK INBOUND NORTH BS1-020 PS10-5 Powered-Belted Level Belt 240 LH 460/3/60 25 15 Y N RPH3200BXB-FR NA 24.00'' 12'' N Y Y QTY 4
7 BULK INBOUND NORTH BS1-020-CH1 PS10-5CH1 Induct Chute Chute NA NA NA 25 NA NA NA NA NA 24.00'' 24.00'' NA NA NA NA
8 BULK INBOUND NORTH BS1-020-CH2 PS10-5CH2 Induct Chute Chute NA NA NA 25 NA NA NA NA NA 24.00'' 24.00'' NA NA NA NA
12 BULK INBOUND NORTH BS1-020-DIV2 PS10-5DIV2 Sorter Divert Arm NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
13 BULK INBOUND NORTH BS1-020-DIV3 PS10-5DIV3 Sorter Divert Arm NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
14 BULK INBOUND NORTH BS1-020-DIV4 PS10-5DIV5 Sorter Divert Arm NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
15 BULK INBOUND NORTH BS1-025-CH PS10-6CH 90� Straight Chute 90° Straight Chute Chute NA NA NA 25 NA NA NA NA NA 24.00'' 24.00'' NA NA NA NA
16 BULK INBOUND NORTH BS2-005 PS11-1 Powered-Belted Level Belt 150 RH 460/3/60 25 5 Y N RPH3200BXB-FR NA 24.00'' 12'' N Y NA NA 36" SG
17 BULK INBOUND NORTH BS2-010 PS11-2 Powered-Belted Incline Belt 200 RH 460/3/60 25 5 Y N APH150MFOXLN NA 24.00'' 12'' Y Y NA NA 36" SG
18 BULK INBOUND NORTH BS2-015 PS11-3 Powered-Belted Level Belt 240 RH 460/3/60 25 10 Y N RPH3200BXB-FR NA 24.00'' 12'' Y Y NA NA
19 BULK INBOUND NORTH BS2-020 PS11-4 Powered-Belted Incline Belt 240 LH 460/3/60 25 10 Y N APH150MFOXLN NA 24.00'' - N Y NA NA
20 BULK INBOUND NORTH BS2-025-CH PS11-5CH 90� Spiral Chute 90° Spiral Chute Chute NA NA NA 25 NA NA NA NA NA 24.00'' 24.00'' NA NA NA NA
21 BULK INBOUND NORTH BS2-030 PS11-6 Powered-Belted Incline Belt 240 RH 460/3/60 25 10 Y N APH150MFOXLN NA 24.00'' - N Y NA NA 36" SG
22 BULK INBOUND NORTH BS2-035 PS11-7 Powered-Belted Incline Belt 240 RH 460/3/60 25 20 Y N APH150MFOXLN NA 24.00'' - N Y NA NA 36" SG
23 BULK INBOUND NORTH BS2-040 PS11-8 Powered-Belted Incline Belt 240 RH 460/3/60 25 15 Y N APH150MFOXLN NA 24.00'' - N Y NA NA 36" SG
24 BULK INBOUND NORTH BS2-045 PS11-9 Powered-Belted Incline Belt 240 RH 460/3/60 25 15 Y N APH150MFOXLN NA 24.00'' 12'' N Y NA NA 36" SG
25 BULK INBOUND NORTH BS2-050-CH PS11-10CH 90� Spiral Chute 90° Spiral Chute Chute NA NA NA 25 NA NA NA NA NA 24.00'' 24.00'' NA NA NA NA
26 BULK INBOUND NORTH BS2-055 PS11-11 Powered-Belted Level Belt 240 LH 460/3/60 25 15 Y N RPH3200BXB-FR NA 24.00'' 12'' N Y Y QTY 6
27 BULK INBOUND NORTH BS2-055-CH1 PS11-11CH1 Induct Chute Chute NA NA NA 25 NA NA NA NA NA 24.00'' 24.00'' NA NA NA NA
28 BULK INBOUND NORTH BS2-055-CH2 PS11-11CH2 Induct Chute Chute NA NA NA 25 NA NA NA NA NA 24.00'' 24.00'' NA NA NA NA
36 BULK INBOUND NORTH BS2-055-DIV4 PS11-11DIV4 Sorter Divert Arm NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
37 BULK INBOUND NORTH BS2-055-DIV5 PS11-11DIV5 Sorter Divert Arm NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
38 BULK INBOUND NORTH BS2-055-DIV6 PS11-11DIV6 Sorter Divert Arm NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
39 BULK INBOUND NORTH BS2-060-CH PS1-12CH 90� Straight Chute 90° Straight Chute Chute NA NA NA 25 NA NA NA NA NA 24.00'' 24.00'' NA NA NA NA
40 BULK INBOUND SOUTH BS3-005 PS8-1 Powered-Belted Level Belt 150 LH 460/3/60 25 7.5 Y N RPH3200BXB-FR NA 24.00'' 12'' N Y NA NA
41 BULK INBOUND SOUTH BS3-010 PS8-2 Powered-Belted Incline Belt 200 LH 460/3/60 25 10 Y N RPH3200BXB-FR NA 24.00'' 12'' Y Y NA NA
42 BULK INBOUND SOUTH BS3-015-CH PS8-3CH 90� Spiral Chute 90° Spiral Chute Chute NA NA NA 25 NA NA NA NA NA 24.00'' 24.00'' NA NA NA NA
43 BULK INBOUND SOUTH BS3-020 PS8-4 Powered-Belted Incline Belt 240 LH 460/3/60 25 15 Y N APH150MFOXLN NA 24.00'' - N Y NA NA 36" SG
44 BULK INBOUND SOUTH BS3-025 PS8-5 Powered-Belted Incline Belt 240 LH 460/3/60 25 15 Y N APH150MFOXLN NA 24.00'' 12'' N Y NA NA 36" SG
45 BULK INBOUND SOUTH BS3-030 PS8-6 Sorter Intralox Flowsplitter 240 LH 460/3/60 25 5 NA NA NA NA NA NA NA NA NA NA
46 BULK INBOUND SOUTH BS3-035-CH PS8-7CH Straight Chute Chute NA NA NA 25 NA NA NA NA NA 24.00'' 24.00'' NA NA NA NA
47 BULK INBOUND SOUTH BS3-040 PS8-8 Powered-Belted Incline Belt 240 RH 460/3/60 25 7.5 Y N APH150MFOXLN NA 24.00'' - N Y NA NA
48 BULK INBOUND SOUTH BS3-045 PS8-9 Powered-Belted Incline Belt 240 LH 460/3/60 25 15 Y N APH150MFOXLN NA 24.00'' - N Y NA NA
49 BULK INBOUND SOUTH BS3-050-CH PS8-10CH 90� Spiral Chute 90° Spiral Chute Chute NA NA NA 25 NA NA NA NA NA 24.00'' 24.00'' NA NA NA NA
50 BULK INBOUND SOUTH BS3-055 PS8-11 Powered-Belted Level Belt 240 RH 460/3/60 25 10 Y N RPH3200BXB-FR NA 24.00'' 12'' N Y Y QTY 4
51 BULK INBOUND SOUTH BS3-055-CH1 PS8-11CH1 Induct Chute Chute NA NA NA 25 NA NA NA NA NA 24.00'' 24.00'' NA NA NA NA
52 BULK INBOUND SOUTH BS3-055-CH2 PS8-11CH2 Induct Chute Chute NA NA NA 25 NA NA NA NA NA 24.00'' 24.00'' NA NA NA NA
56 BULK INBOUND SOUTH BS3-055-DIV2 PS8-11DIV2 Sorter Divert Arm NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
57 BULK INBOUND SOUTH BS3-055-DIV3 PS8-11DIV3 Sorter Divert Arm NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
58 BULK INBOUND SOUTH BS3-055-DIV4 PS8-11DIV4 Sorter Divert Arm NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
59 BULK INBOUND SOUTH BS3-060-CH PS8-12CH 90� Straight Chute 90° Straight Chute Chute NA NA NA 25 NA NA NA NA NA 24.00'' 24.00'' NA NA NA NA
60 BULK INBOUND SOUTH BS4-005-CH PS9-1CH Straight Chute Chute NA NA NA 25 NA NA NA NA NA 24.00'' 24.00'' NA NA NA NA
61 BULK INBOUND SOUTH BS4-010 PS9-2 Powered-Belted Decline Belt 240 RH 460/3/60 25 7.5 Y N APH150MFOXLN NA 24.00'' - N Y NA NA
62 BULK INBOUND SOUTH BS4-015 PS9-3 Powered-Belted Level Belt 240 RH 460/3/60 25 7.5 Y N RPH3200BXB-FR NA 24.00'' 12'' N Y Y QTY 4

118
app.py
View File

@ -54,7 +54,8 @@ def get_views_dir_path():
def get_text_output_dir_path():
# Construct absolute path based on the script's directory
script_dir = os.path.dirname(os.path.abspath(__file__))
return os.path.join(script_dir, TEXT_OUTPUT_FOLDER)
# Use os.path.join to handle path separators correctly and avoid './'
return os.path.abspath(os.path.join(script_dir, TEXT_OUTPUT_FOLDER))
def normalize(text):
"""Normalize string for comparison: lowercase, treat '-' and '_' the same, remove all whitespace."""
@ -68,12 +69,14 @@ def normalize(text):
def read_manifest(csv_filepath):
"""Reads the manifest CSV into a list of dictionaries."""
manifest_items = []
# Only require Alias and Panel now for basic grouping
required_cols = {CSV_ALIAS_COL, CSV_PANEL_COL}
optional_cols = {CSV_EQ_TYPE_COL, CSV_CONV_TYPE_COL}
try:
with open(csv_filepath, mode='r', newline='', encoding='utf-8') as infile:
# Revert back to 'utf-8-sig' to handle potential BOM from Excel
with open(csv_filepath, mode='r', newline='', encoding='utf-8-sig') as infile:
reader = csv.DictReader(infile)
headers = set(h.strip() for h in reader.fieldnames) # Handle potential whitespace in headers
headers = set(h.strip() for h in reader.fieldnames)
# Check for required columns
missing_required = required_cols - headers
@ -85,12 +88,16 @@ def read_manifest(csv_filepath):
for row in reader:
alias = row.get(CSV_ALIAS_COL, "").strip()
panel = row.get(CSV_PANEL_COL, "").strip()
if alias and panel: # Only add if Alias and Control Panel are present
# unit_number = row.get('Unit Number', "").strip() # No longer needed for filename
# Add if Alias and Control Panel are present (Panel needed for grouping results later)
if alias and panel:
item = {
"alias": alias,
"normalized_alias": normalize(alias),
"control_panel": panel,
"expected_drawing_filename": f"{panel}.txt", # Assuming .txt file matches panel name
# "unit_number": unit_number, # Removed
# "expected_drawing_filename": f"MTN6_SYSDL-{unit_number}.txt", # Removed
# Add optional data if columns exist
"equipment_type": row.get(CSV_EQ_TYPE_COL, "").strip() if CSV_EQ_TYPE_COL in headers else "N/A",
"conveyor_type": row.get(CSV_CONV_TYPE_COL, "").strip() if CSV_CONV_TYPE_COL in headers else "N/A",
@ -99,8 +106,11 @@ def read_manifest(csv_filepath):
"found_drawing": False
}
manifest_items.append(item)
# elif alias and panel: # If Unit Number is missing but others are present # Condition removed
# print(f"Warning: Alias '{alias}' in Panel '{panel}' is missing 'Unit Number' in CSV. Skipping drawing check for this item.")
elif alias and not panel:
print(f"Warning: Alias '{alias}' found in CSV but is missing its '{CSV_PANEL_COL}'. Skipping.")
# Add other specific warnings if needed
except FileNotFoundError:
print(f"Error: Manifest file not found at {csv_filepath}")
@ -154,44 +164,50 @@ def check_scada(manifest_data, views_dir):
def check_drawings(manifest_data, text_output_dir):
"""Checks for aliases in extracted drawing text files, one file per panel."""
"""Checks if aliases from manifest exist in *any* extracted drawing text file."""
if not manifest_data: return
print(f"Starting Drawings check in directory: {text_output_dir}...")
found_count = 0
file_cache = {} # Cache normalized content of processed text files
print(f"Starting Drawings check: Scanning all .txt files in directory: {text_output_dir}...")
for item in manifest_data:
normalized_alias = item['normalized_alias']
txt_filename = item['expected_drawing_filename']
txt_filepath = os.path.join(text_output_dir, txt_filename)
all_normalized_content = "" # Combine all text content here
processed_files = 0
found_files = []
try:
# Check cache first
if txt_filepath in file_cache:
normalized_content = file_cache[txt_filepath]
# Read and cache if not already processed
elif os.path.exists(txt_filepath):
with open(txt_filepath, 'r', encoding='utf-8') as f:
# Step 1: Read and combine content of all .txt files in the directory
for filename in os.listdir(text_output_dir):
if filename.lower().endswith('.txt'):
filepath = os.path.join(text_output_dir, filename)
processed_files += 1
try:
with open(filepath, 'r', encoding='utf-8') as f:
content = f.read()
normalized_content = normalize(content)
file_cache[txt_filepath] = normalized_content # Cache it
else:
# File doesn't exist, mark as not found in cache to avoid re-checking
file_cache[txt_filepath] = None
# print(f" Info: Expected drawing text file not found: {txt_filepath}")
continue # Cannot find alias if file doesn't exist
# Add a separator to prevent false matches across file boundaries
all_normalized_content += normalize(content) + "\n--file-separator--\n"
found_files.append(filename)
except Exception as e:
print(f" Warning: Could not read or process text file {filepath}: {e}")
# Perform check if file content exists
if normalized_content is not None and normalized_alias in normalized_content:
if not item['found_drawing']: # Avoid double counting if alias appears multiple times in manifest
if processed_files == 0:
print(" Warning: No .txt files found in the directory. Cannot perform drawing check.")
return
else:
print(f" Successfully read and normalized content from {len(found_files)} out of {processed_files} .txt files found.")
# Step 2: Check each manifest alias against the combined content
found_count = 0
for item in manifest_data:
normalized_alias = item['normalized_alias']
if normalized_alias and normalized_alias in all_normalized_content:
item['found_drawing'] = True
found_count += 1
# else: item['found_drawing'] is already False by default
print(f"Drawings check finished. Found {found_count} manifest aliases within the combined text content.")
except FileNotFoundError:
print(f" Error: Drawings text directory not found: {text_output_dir}")
except Exception as e:
print(f" Warning: Could not read or process text file {txt_filepath}: {e}")
file_cache[txt_filepath] = None # Mark as failed in cache
print(f"Drawings check finished. Processed {len(file_cache)} unique text files. Found {found_count} manifest aliases.")
print(f" Error during drawings check: {e}")
def calculate_combined_progress(manifest_data):
@ -323,8 +339,7 @@ def update_progress_data():
status_message = current_status # Update status regardless of calculation success/failure
if new_data_calculated is not None:
progress_data = new_data_calculated
# Signal that an update attempt finished
# Signal that an update attempt finished WITH new data
data_updated_event.set()
data_updated_event.clear()
@ -342,8 +357,6 @@ def check_and_update_repo():
if not repo_existed:
print(f"Cloning repository {REPO_URL} into {repo_path}...")
status_message = f"Cloning repository {REPO_URL}..."
# Signal status change during long operation
data_updated_event.set(); data_updated_event.clear()
git.Repo.clone_from(REPO_URL, repo_path, branch=BRANCH)
repo = git.Repo(repo_path)
last_commit_hash = repo.head.commit.hexsha
@ -368,8 +381,6 @@ def check_and_update_repo():
if current_local_commit != current_remote_commit:
print("New commit detected! Pulling changes...")
status_message = "Pulling updates..."
# Signal status change during potentially long operation
data_updated_event.set(); data_updated_event.clear()
try:
pull_info = origin.pull()
new_commit_hash = repo.head.commit.hexsha
@ -392,9 +403,10 @@ def check_and_update_repo():
# Status will be updated within update_progress_data
update_progress_data()
# If no git update, signal any status change (e.g., "No changes" or error)
else:
data_updated_event.set() # Signal status change event
data_updated_event.clear()
# else: # REMOVED block that signaled event for no changes
# REMOVED: data_updated_event.set() # Signal status change event
# REMOVED: data_updated_event.clear()
# Status message is still updated globally, just won't trigger event
except git.GitCommandError as e:
status_message = f"Git command error: {e}"
@ -407,14 +419,14 @@ def check_and_update_repo():
if last_commit_hash is None: last_commit_hash = repo.head.commit.hexsha
except Exception:
if last_commit_hash is None: last_commit_hash = "Error reading commit"
data_updated_event.set() # Signal error status change
data_updated_event.clear()
# REMOVED: data_updated_event.set() # Signal error status change
# REMOVED: data_updated_event.clear()
except Exception as e:
status_message = f"Error checking repository: {e}"
print(status_message)
if last_commit_hash is None: last_commit_hash = "Error checking repo"
data_updated_event.set() # Signal error status change
data_updated_event.clear()
# REMOVED: data_updated_event.set() # Signal error status change
# REMOVED: data_updated_event.clear()
# Return true if analysis was run (because repo changed), false otherwise
return did_update
@ -439,6 +451,16 @@ def periodic_repo_check():
def index():
return render_template('index.html')
@app.route('/drawings')
def drawings_page():
# Render the main index template which now contains all content
return render_template('index.html')
@app.route('/conflicts')
def conflicts_page():
# Render the main index template which now contains all content
return render_template('index.html')
@app.route('/stream')
def stream():
def event_stream():
@ -479,11 +501,11 @@ def stream():
})
yield f"data: {data_payload}\n\n"
last_sent_hash_to_client = current_global_hash # Update the hash sent to this client
else:
# else: # No need for the else block logging here anymore, as the event shouldn't trigger if hash is same
# If hash is the same, maybe only the status message changed (e.g., error occurred)
# Option: Send update only if status is different from last sent status?
# For simplicity now, we only send if hash differs. Client UI shows last known status.
print(f"Data updated event triggered, but hash {current_global_hash} unchanged for this client. Status: '{current_global_status}'")
# print(f"Data updated event triggered, but hash {current_global_hash} unchanged for this client. Status: '{current_global_status}'") # Removed log
return Response(event_stream(), mimetype="text/event-stream")

@ -1 +1 @@
Subproject commit c8aa36809970e0557f46ee80b7f7cf3735efb487
Subproject commit 456de12cca56c09bc1881660b163ac3b5dff593a

View File

@ -3,7 +3,7 @@
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Ignition SCADA & Drawing Progress Monitor</title>
<title>SCADA Progress Monitor</title>
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/css/bootstrap.min.css" rel="stylesheet">
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
<style>
@ -37,7 +37,7 @@
margin: 0 auto; /* Center the canvas */
cursor: pointer; /* Indicate clickable */
}
#panels-progress {
#scada-panels-progress, #drawing-panels-progress {
display: grid;
grid-template-columns: repeat(auto-fill, minmax(200px, 1fr)); /* Responsive grid */
gap: 20px;
@ -47,27 +47,68 @@
.modal-body th { background-color: #f8f9fa; text-align: left; }
.status-yes { color: green; font-weight: bold; }
.status-no { color: red; font-weight: bold; }
nav { margin-bottom: 20px; } /* Added for nav spacing */
</style>
</head>
<body>
<div class="container">
<h1 class="mb-4">SCADA & Drawing Device Placement Progress</h1>
<!-- Added Navigation -->
<nav class="nav nav-pills">
<a class="nav-link active" aria-current="page" href="/">SCADA Progress</a>
<a class="nav-link" href="/drawings">Drawing Progress</a>
<a class="nav-link" href="/conflicts">Conflicts</a>
</nav>
<div id="overall-progress" class="chart-container">
<span class="chart-label">Overall Progress</span>
<canvas id="overall-chart-canvas" class="panel-chart-canvas" style="max-width: 200px; max-height: 200px;"></canvas>
<div id="overall-text" style="font-weight: bold; margin-top: 10px;">Found Both: 0/0 (0%)</div>
<!-- SCADA Content Section -->
<div id="scada-content">
<h1 class="mb-4">SCADA Device Placement Progress</h1>
<p>Compares the Equipment Manifest against the SCADA view.json files.</p>
<div id="overall-scada-progress" class="chart-container">
<span class="chart-label">Overall SCADA Progress</span>
<canvas id="overall-scada-chart-canvas" class="panel-chart-canvas" style="max-width: 200px; max-height: 200px;"></canvas>
<div id="overall-scada-text" style="font-weight: bold; margin-top: 10px;">Found in SCADA: 0/0 (0%)</div>
</div>
<hr>
<h2>Progress by Control Panel</h2>
<div id="panels-progress">
<!-- Charts will be loaded here -->
<h2>SCADA Progress by Control Panel</h2>
<div id="scada-panels-progress">
<p>Loading panel data...</p>
</div>
</div>
<!-- Drawing Content Section (Initially Hidden) -->
<div id="drawings-content" style="display: none;">
<h1 class="mb-4">Drawing Device Placement Progress</h1>
<p>Compares the Equipment Manifest against the extracted text from drawing files (.txt).</p>
<div id="overall-drawing-progress" class="chart-container">
<span class="chart-label">Overall Drawing Progress</span>
<canvas id="overall-drawing-chart-canvas" class="panel-chart-canvas" style="max-width: 200px; max-height: 200px;"></canvas>
<div id="overall-drawing-text" style="font-weight: bold; margin-top: 10px;">Found in Drawing: 0/0 (0%)</div>
</div>
<hr>
<h2>Drawing Progress by Control Panel</h2>
<div id="drawing-panels-progress">
<p>Loading panel data...</p>
</div>
</div>
<!-- Conflicts Content Section (Initially Hidden) -->
<div id="conflicts-content" style="display: none;">
<h1 class="mb-4">SCADA/Drawing Conflicts <span id="conflict-count" class="badge bg-warning ms-2">0</span></h1>
<p>Items found in SCADA views but <strong>not</strong> found in the extracted drawing text files.</p>
<div id="panels-conflicts">
<p>Loading conflict data...</p>
</div>
</div>
</div>
<!-- Status Bar -->
<div class="status-bar">
<span id="status-message">Initializing...</span> | Last Commit: <span id="last-commit">N/A</span>
@ -89,7 +130,6 @@
<th>Panel</th>
<th>SCADA Status</th>
<th>Drawing Status</th>
<th>Expected Drawing File</th>
<th>Equipment Type</th>
<th>Type of Conveyor</th>
</tr>
@ -108,180 +148,332 @@
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/js/bootstrap.bundle.min.js"></script>
<script>
let chartInstances = {};
let progressDetailsData = {};
// --- Global State Variables ---
let chartInstancesScada = {}; // Separate instances for SCADA
let chartInstancesDrawing = {}; // Separate instances for Drawing
let progressDetailsData = {}; // Stores the raw data from SSE (shared)
let previousCommitHash = null; // Single hash for the whole page
let detailsModalInstance = null;
let currentVisibleSection = 'scada'; // Track visible section: 'scada', 'drawing', 'conflicts'
// Define labels and colors consistently
const chartLabels = ['Found Both', 'SCADA Only', 'Drawing Only', 'Missing Both'];
const chartColors = [
'rgb(25, 135, 84)', // Green (Found Both)
'rgb(13, 202, 240)', // Cyan (SCADA Only)
'rgb(255, 193, 7)', // Yellow (Drawing Only)
'rgb(220, 53, 69)' // Red (Missing Both)
];
const listKeys = ['found_both_list', 'found_scada_only_list', 'found_drawing_only_list', 'missing_list'];
// --- Chart Configurations ---
const scadaChartLabels = ['Found in SCADA', 'Not Found in SCADA'];
const scadaChartColors = ['rgb(13, 110, 253)', 'rgb(220, 53, 69)'];
const drawingChartLabels = ['Found in Drawing', 'Not Found in Drawing'];
const drawingChartColors = ['rgb(25, 135, 84)', 'rgb(220, 53, 69)'];
// --- Chart Click Handler (Updated) ---
function handleChartClick(event, elements, chart) {
if (elements.length > 0) {
const clickedElementIndex = elements[0].index;
const isOverallChart = chart.canvas.id === 'overall-chart-canvas';
const identifier = isOverallChart ? '__overall__' : chart.canvas.id.replace('chart-', '');
// Map clicked index to the correct list type/key
if (clickedElementIndex >= 0 && clickedElementIndex < listKeys.length) {
const listType = listKeys[clickedElementIndex];
showDetailsModal(identifier, listType);
} else {
console.warn("Clicked unknown chart segment index:", clickedElementIndex);
}
}
}
// --- UI Update Function (Heavily Updated) ---
function updateUI(data) {
console.log("Updating UI with data:", data);
progressDetailsData = data.progress;
// Update status bar
document.getElementById('status-message').textContent = data.status;
document.getElementById('last-commit').textContent = data.last_commit || 'N/A';
// --- Update Overall Chart & Text ---
const overallData = progressDetailsData.overall;
const overallTotal = overallData.total_csv;
const overallChartCounts = [
overallData.found_both,
overallData.found_scada_only,
overallData.found_drawing_only,
overallData.missing_both
];
// Update text (showing found both %)
document.getElementById('overall-text').textContent = `Found Both: ${overallData.found_both}/${overallTotal} (${overallData.percentage_found_both}%)`;
const overallChartConfig = {
type: 'pie',
data: {
labels: chartLabels,
datasets: [{
label: 'Overall Aliases',
data: overallChartCounts,
backgroundColor: chartColors,
hoverOffset: 4
}]
},
options: {
responsive: true,
maintainAspectRatio: false,
onClick: handleChartClick,
plugins: {
legend: { display: false },
tooltip: {
callbacks: {
label: function(context) {
let label = context.label || '';
if (label) label += ': ';
const value = context.parsed;
if (value !== null) label += value;
if (overallTotal > 0) {
label += ` (${((value / overallTotal) * 100).toFixed(1)}%)`;
}
return label;
}
}
}
}
}
// Map backend list keys for modal clicks (can be combined or kept separate if needed)
const scadaListKeysMap = {
found: ['found_both_list', 'found_scada_only_list'],
notFound: ['found_drawing_only_list', 'missing_list']
};
const drawingListKeysMap = {
found: ['found_both_list', 'found_drawing_only_list'],
notFound: ['found_scada_only_list', 'missing_list']
};
const overallCanvas = document.getElementById('overall-chart-canvas');
if (chartInstances['overall']) {
chartInstances['overall'].data = overallChartConfig.data;
chartInstances['overall'].update();
} else if (overallCanvas) {
const ctxOverall = overallCanvas.getContext('2d');
chartInstances['overall'] = new Chart(ctxOverall, overallChartConfig);
// --- Debounce Utility (Only need one) ---
function debounce(func, wait) {
let timeout;
return function executedFunction(...args) {
const later = () => {
clearTimeout(timeout);
func(...args);
};
clearTimeout(timeout);
timeout = setTimeout(later, wait);
};
}
// NOTE(review): this span appears to be merge/diff residue — top-level
// consts from an older updateUI() body are interleaved with the newer
// handleChartClick(), and the braces do not balance (the panel-removal
// loop and the showDetailsModal call are tangled together). Annotated
// only; needs manual reconciliation against the intended version.
// --- Update Panel Charts ---
const panelsContainer = document.getElementById('panels-progress');
const panelsData = progressDetailsData.panels;
const sortedPanels = Object.keys(panelsData).sort();
const currentPanelsOnPage = new Set(Object.keys(chartInstances).filter(k => k !== 'overall'));
const incomingPanels = new Set(sortedPanels);
// --- Chart Click Handler (Needs context: SCADA or Drawing?) ---
function handleChartClick(event, elements, chart, context) { // Added context
if (elements.length > 0) {
const clickedElementIndex = elements[0].index;
const isOverallChart = chart.canvas.id.startsWith('overall-'); // More robust check
const identifier = isOverallChart ? '__overall__' : chart.canvas.id.replace(`chart-${context}-`, ''); // Use context
// Slice 0 of each pie is the "found" bucket; any other slice is "notFound".
const categoryType = clickedElementIndex === 0 ? 'found' : 'notFound';
// Remove charts for panels no longer present
// NOTE(review): the loop below mixes chart-cleanup logic with the modal
// call — showDetailsModal should fire for the clicked slice, not inside
// a panel-removal loop. Looks like two diff hunks fused together.
currentPanelsOnPage.forEach(panelName => {
if (!incomingPanels.has(panelName)) {
if(chartInstances[panelName]) { chartInstances[panelName].destroy(); delete chartInstances[panelName]; }
const chartElement = document.getElementById(`chart-container-${panelName}`);
if (chartElement) chartElement.remove();
showDetailsModal(identifier, categoryType, context); // Pass context to modal
}
}
});
// Update or create charts for current panels
// NOTE(review): the block below belongs to the older updateUI() body.
if (sortedPanels.length === 0) {
panelsContainer.innerHTML = '<p>No panel data available yet.</p>';
// --- Core UI Update Functions (One for each section) ---
// Redraws the SCADA view from a fresh progress payload: updates the summary
// text, then creates/updates the overall pie chart only while the SCADA
// section is visible (destroying it when hidden), and finally delegates the
// per-panel charts to updatePanelCharts().
function updateUIScadaCore(data) {
console.log("Running core SCADA UI redraw logic for commit:", data.last_commit);
progressDetailsData = data.progress; // Update shared raw data
// --- Overall SCADA Chart ---
// "Found in SCADA" = found in both sources + found in SCADA only.
const overallData = progressDetailsData.overall;
const overallTotal = overallData.total_csv;
const overallFoundScada = overallData.found_both + overallData.found_scada_only;
const overallNotFoundScada = overallData.found_drawing_only + overallData.missing_both;
const overallPercentageFound = overallTotal > 0 ? ((overallFoundScada / overallTotal) * 100).toFixed(1) : 0;
const overallChartCounts = [overallFoundScada, overallNotFoundScada];
document.getElementById('overall-scada-text').textContent = `Found in SCADA: ${overallFoundScada}/${overallTotal} (${overallPercentageFound}%)`;
// --- Only update/create chart if section is visible ---
const isSectionVisible = (currentVisibleSection === 'scada');
if (isSectionVisible) {
const overallScadaCanvas = document.getElementById('overall-scada-chart-canvas');
if (chartInstancesScada['overall']) {
// Skip the redraw entirely when the counts are unchanged.
if (JSON.stringify(chartInstancesScada['overall'].data.datasets[0].data) !== JSON.stringify(overallChartCounts)) {
chartInstancesScada['overall'].data.datasets[0].data = overallChartCounts;
chartInstancesScada['overall'].update('none');
}
} else if (overallScadaCanvas) {
console.log("Creating overall SCADA chart (visible).");
const ctxOverall = overallScadaCanvas.getContext('2d');
chartInstancesScada['overall'] = new Chart(ctxOverall, createChartConfig(overallChartCounts, overallTotal, 'scada', 'overall'));
}
} else {
// Remove loading message if it exists
// NOTE(review): the two lines below look like stray diff residue —
// `panelsContainer` is declared with `const` only AFTER this branch
// (temporal dead zone), so executing this path would throw a
// ReferenceError. Confirm against the intended version and remove.
const loadingMsg = panelsContainer.querySelector('p');
if (loadingMsg && loadingMsg.textContent.includes('Loading')) { loadingMsg.remove(); }
// If section is not visible, destroy the chart instance if it exists
if (chartInstancesScada['overall']) {
console.log("Destroying hidden overall SCADA chart.");
chartInstancesScada['overall'].destroy();
delete chartInstancesScada['overall'];
}
}
// --- SCADA Panel Charts ---
const panelsContainer = document.getElementById('scada-panels-progress');
const panelsData = progressDetailsData.panels || {};
updatePanelCharts(panelsContainer, panelsData, chartInstancesScada, 'scada');
console.log("Finished SCADA UI core redraw.");
}
// Redraw the Drawing view from a fresh progress payload: refresh the summary
// text, create/update the overall pie chart only while the 'drawings'
// section is visible (destroy it when hidden), then hand the per-panel
// charts off to updatePanelCharts().
function updateUIDrawingCore(data) {
    console.log("Running core Drawing UI redraw logic for commit:", data.last_commit);
    progressDetailsData = data.progress; // refresh the shared snapshot
    // "Found in Drawing" = found in both sources + found in drawing only.
    const totals = progressDetailsData.overall;
    const csvTotal = totals.total_csv;
    const foundCount = totals.found_both + totals.found_drawing_only;
    const missingCount = totals.found_scada_only + totals.missing_both;
    const pctFound = csvTotal > 0 ? ((foundCount / csvTotal) * 100).toFixed(1) : 0;
    const counts = [foundCount, missingCount];
    document.getElementById('overall-drawing-text').textContent = `Found in Drawing: ${foundCount}/${csvTotal} (${pctFound}%)`;
    const existing = chartInstancesDrawing['overall'];
    if (currentVisibleSection === 'drawings') {
        const canvasEl = document.getElementById('overall-drawing-chart-canvas');
        if (existing) {
            // Touch the chart only when the counts actually changed.
            if (JSON.stringify(existing.data.datasets[0].data) !== JSON.stringify(counts)) {
                existing.data.datasets[0].data = counts;
                existing.update('none');
            }
        } else if (canvasEl) {
            console.log("Creating overall drawing chart (visible).");
            chartInstancesDrawing['overall'] = new Chart(canvasEl.getContext('2d'), createChartConfig(counts, csvTotal, 'drawing', 'overall'));
        }
    } else if (existing) {
        // Section hidden: drop the instance so it is rebuilt on next show.
        console.log("Destroying hidden overall Drawing chart.");
        existing.destroy();
        delete chartInstancesDrawing['overall'];
    }
    // Per-panel drawing charts (updatePanelCharts re-checks visibility).
    const container = document.getElementById('drawing-panels-progress');
    const perPanel = progressDetailsData.panels || {};
    console.log(`[updateUIDrawingCore] Found drawing panels container:`, container ? container.id : 'Not Found'); // Added Log
    updatePanelCharts(container, perPanel, chartInstancesDrawing, 'drawings'); // context matches section id
    console.log("Finished Drawing UI core redraw.");
}
// Rebuild the Conflicts view: one heading + table per panel that has
// aliases found in SCADA but absent from the drawings
// (found_scada_only_list), and refresh the total-count badge.
function updateUIConflictsCore(data) {
    console.log("Running core Conflicts UI redraw logic for commit:", data.last_commit);
    progressDetailsData = data.progress; // refresh the shared snapshot
    const container = document.getElementById('panels-conflicts');
    container.innerHTML = ''; // rebuild the section from scratch
    const perPanel = progressDetailsData.panels;
    let conflictTotal = 0;
    let panelsWithIssues = 0;
    if (!perPanel || Object.keys(perPanel).length === 0) {
        container.innerHTML = '<p class="text-center fst-italic">No panel data available yet.</p>';
    } else {
        for (const panelName of Object.keys(perPanel).sort()) {
            const entry = perPanel[panelName];
            const conflicts = entry.found_scada_only_list || [];
            if (conflicts.length === 0) continue; // panel is clean
            panelsWithIssues += 1;
            conflictTotal += conflicts.length;
            // Panel heading with its conflict count.
            const heading = document.createElement('h4');
            heading.className = 'mt-4 mb-2';
            heading.textContent = `${panelName} (${conflicts.length} conflicts)`;
            container.appendChild(heading);
            // One table per panel, rows sorted by alias.
            const table = document.createElement('table');
            table.className = 'table table-sm table-striped table-hover table-bordered';
            const thead = table.createTHead();
            thead.innerHTML = `<tr><th>Alias</th><th>Panel</th><th>SCADA Status</th><th>Drawing Status</th><th>Equipment Type</th><th>Type of Conveyor</th></tr>`;
            const tbody = table.createTBody();
            for (const item of conflicts.sort((a, b) => a.alias.localeCompare(b.alias))) {
                const row = tbody.insertRow();
                row.classList.add('table-warning');
                row.insertCell().textContent = item.alias;
                row.insertCell().textContent = item.control_panel;
                row.insertCell().innerHTML = '<span class="status-yes">Yes</span>';
                row.insertCell().innerHTML = '<span class="status-no">No</span>';
                row.insertCell().textContent = item.equipment_type || 'N/A';
                row.insertCell().textContent = item.conveyor_type || 'N/A';
            }
            container.appendChild(table);
        }
        if (panelsWithIssues === 0) {
            container.innerHTML = '<p class="text-center fst-italic">No conflicts found across all panels.</p>';
        }
    }
    // Badge shows the grand total and hides itself when there is nothing.
    const badge = document.getElementById('conflict-count');
    if (badge) {
        badge.textContent = conflictTotal;
        badge.style.display = conflictTotal > 0 ? 'inline-block' : 'none';
    }
    console.log("Finished Conflicts UI core redraw.");
}
// --- Generic Panel Chart Update Logic ---
// NOTE(review): this function contains interleaved old-version and
// new-version lines from a mis-rendered diff (duplicate declarations,
// duplicate element creation, an orphaned object literal near the end).
// It is annotated in place below; it will not parse as-is and must be
// reconciled against the intended version.
// Intended contract: given a container element, the per-panel progress
// map, the chart-instance registry for one context ('scada'/'drawings'),
// create/update one pie per panel when that context is visible, and
// destroy charts for hidden contexts or removed panels.
function updatePanelCharts(panelsContainer, panelsData, chartInstances, context) { // context: 'scada' or 'drawing'
const incomingPanelNames = new Set(Object.keys(panelsData).sort());
const existingInstanceNames = new Set(Object.keys(chartInstances).filter(k => k !== 'overall'));
// --- Check if the context matches the currently visible section ---
const isSectionVisible = (context === currentVisibleSection);
if (!isSectionVisible) {
// If section is not visible, destroy existing panel chart instances for this context
console.log(`Destroying hidden panel charts for context: ${context}`);
existingInstanceNames.forEach(panelName => {
if (chartInstances[panelName]) {
chartInstances[panelName].destroy();
delete chartInstances[panelName];
}
});
// Don't proceed further if the section is hidden
return;
}
if (incomingPanelNames.size > 0) {
const loadingMsg = panelsContainer.querySelector('p');
if (loadingMsg) { loadingMsg.remove(); }
incomingPanelNames.forEach(panelName => {
const panel = panelsData[panelName];
const panelTotal = panel.total;
// NOTE(review): old-version 4-bucket counts — superseded by the
// 2-bucket `let panelChartCounts` immediately below; keeping both
// is a SyntaxError (const + let redeclaration in one scope).
const panelChartCounts = [
panel.found_both,
panel.found_scada_only,
panel.found_drawing_only,
panel.missing_both
];
let panelChartCounts;
if (context === 'scada') {
panelChartCounts = [panel.found_both + panel.found_scada_only, panel.found_drawing_only + panel.missing_both];
} else { // drawing
panelChartCounts = [panel.found_both + panel.found_drawing_only, panel.found_scada_only + panel.missing_both];
}
// NOTE(review): the lookups/creation below using IDs WITHOUT the
// context prefix are old-version residue; the new code uses
// `chart-${context}-${panelName}` style IDs.
let chartContainer = document.getElementById(`chart-container-${panelName}`);
let canvas = document.getElementById(`chart-${panelName}`);
// Create container and canvas if they don't exist
if (!chartContainer) {
chartContainer = document.createElement('div');
chartContainer.id = `chart-container-${panelName}`;
// --- Only update/create chart if section is visible ---
if (isSectionVisible) {
if (chartInstances[panelName]) {
if (JSON.stringify(chartInstances[panelName].data.datasets[0].data) !== JSON.stringify(panelChartCounts)) {
chartInstances[panelName].data.datasets[0].data = panelChartCounts;
chartInstances[panelName].update('none');
}
} else {
let canvas = document.getElementById(`chart-${context}-${panelName}`); // Use context in ID
if (canvas) {
console.log(`Recreating ${context} chart instance for panel (visible): ${panelName}`);
const ctx = canvas.getContext('2d');
chartInstances[panelName] = new Chart(ctx, createChartConfig(panelChartCounts, panelTotal, context, panelName));
} else {
console.log(`Creating new ${context} panel elements and chart (visible) for: ${panelName}`);
const chartContainer = document.createElement('div');
chartContainer.id = `chart-container-${context}-${panelName}`; // Use context in ID
chartContainer.className = 'chart-container';
const label = document.createElement('span');
label.className = 'chart-label'; label.textContent = panelName;
// NOTE(review): the next two lines (no-context canvas id) are
// old-version residue, immediately overwritten by the
// context-prefixed creation that follows.
canvas = document.createElement('canvas');
canvas.id = `chart-${panelName}`;
canvas = document.createElement('canvas'); // Reassign canvas variable
canvas.id = `chart-${context}-${panelName}`; // Use context in ID
canvas.className = 'panel-chart-canvas';
chartContainer.appendChild(label);
chartContainer.appendChild(canvas);
// NOTE(review): chartContainer is appended twice (next line and
// again after the log) — one of these appends is diff residue.
panelsContainer.appendChild(chartContainer);
// Added Log before append
console.log(`[updatePanelCharts] Appending chartContainer (${chartContainer.id}) to panelsContainer (${panelsContainer ? panelsContainer.id : 'null'})`);
panelsContainer.appendChild(chartContainer); // Append to the main panels progress div
const ctx = canvas.getContext('2d');
chartInstances[panelName] = new Chart(ctx, createChartConfig(panelChartCounts, panelTotal, context, panelName));
}
}
}
// --- End visibility check ---
});
} else {
if (!panelsContainer.querySelector('p')) {
panelsContainer.innerHTML = '<p class="text-center fst-italic">No panel data available yet.</p>';
}
}
// NOTE(review): orphaned old-version object literal header — the body
// that followed it was replaced by the cleanup loop below.
const panelChartConfig = {
existingInstanceNames.forEach(panelName => {
if (!incomingPanelNames.has(panelName)) {
console.log(`Removing ${context} panel elements and chart for: ${panelName}`);
// Ensure chart is destroyed before removing element
if (chartInstances[panelName]) {
chartInstances[panelName].destroy();
delete chartInstances[panelName];
}
const chartElement = document.getElementById(`chart-container-${context}-${panelName}`); // Use context
if (chartElement) {
chartElement.remove();
}
}
});
}
// --- Generic Helper to create chart config --- Needs context ---
// NOTE(review): this block contains duplicated old/new lines from a
// mis-rendered diff (two `labels:` keys, two `onClick:` keys, two tooltip
// `label:` callbacks, a literal diff hunk marker, and an old-version
// update/create tail after the closing `};`). Annotated in place; must be
// reconciled before it will parse.
// Intended contract: build a Chart.js pie config for either context using
// that context's labels/colors, wiring clicks through handleChartClick.
function createChartConfig(chartCounts, total, context, identifier) { // identifier is 'overall' or panelName
const labels = context === 'scada' ? scadaChartLabels : drawingChartLabels;
const colors = context === 'scada' ? scadaChartColors : drawingChartColors;
const datasetLabel = context === 'scada' ? 'SCADA Match' : 'Drawing Match';
return {
type: 'pie',
data: {
// NOTE(review): duplicate `labels` keys — first is old-version residue.
labels: chartLabels,
labels: labels,
datasets: [{
// NOTE(review): old-version dataset fields (label/data/backgroundColor
// referencing out-of-scope names) followed by their replacements.
label: 'Aliases',
data: panelChartCounts,
backgroundColor: chartColors,
label: datasetLabel,
data: chartCounts,
backgroundColor: colors,
hoverOffset: 4
}]
},
options: {
responsive: true,
maintainAspectRatio: false,
// NOTE(review): duplicate `onClick` keys — first is old-version residue.
onClick: handleChartClick,
onClick: (event, elements, chart) => handleChartClick(event, elements, chart, context), // Pass context
plugins: {
legend: { display: false },
tooltip: {
callbacks: {
// NOTE(review): old tooltip callback (shadowing `context`) interleaved
// with the renamed `ctxTooltip` replacement below.
label: function(context) {
let label = context.label || '';
label: function(ctxTooltip) {
let label = ctxTooltip.label || '';
if (label) label += ': ';
const value = context.parsed;
const value = ctxTooltip.parsed;
if (value !== null) label += value;
if (panelTotal > 0) {
label += ` (${((value / panelTotal) * 100).toFixed(1)}%)`;
// Use overallTotal for overall chart, panelTotal otherwise (How to get panelTotal here? Needs rethinking)
// Workaround: Don't show percentage on panel tooltips for now
const chartTotal = (identifier === 'overall' && progressDetailsData.overall) ? progressDetailsData.overall.total_csv : null;
if (chartTotal && chartTotal > 0) {
label += ` (${((value / chartTotal) * 100).toFixed(1)}%)`;
}
return label;
}
// NOTE(review): literal diff hunk marker leaked into the source below.
@ -290,96 +482,168 @@
}
}
};
// NOTE(review): everything below is old-version residue referencing
// names (panelName, canvas, panelChartConfig) not in this scope.
// Update existing chart or create new one
if (chartInstances[panelName]) {
chartInstances[panelName].data = panelChartConfig.data;
chartInstances[panelName].update();
} else if (canvas) {
const ctx = canvas.getContext('2d');
chartInstances[panelName] = new Chart(ctx, panelChartConfig);
}
});
// --- Wrapper function called by debouncer (Handles all sections) ---
// Entry point for every SSE payload: refreshes the status bar immediately,
// and schedules a full redraw of all three sections only when the commit
// hash differs from the one already rendered.
function processUpdate(data) {
    console.log("Processing update for commit:", data.last_commit);
    // Status text and commit hash are cheap — always refresh them.
    document.getElementById('status-message').textContent = data.status;
    document.getElementById('last-commit').textContent = data.last_commit || 'N/A';
    const commit = data.last_commit;
    // *** Strict check: redraw only on a commit-hash change ***
    if (!commit || commit === previousCommitHash) {
        console.log("Commit hash unchanged (" + previousCommitHash + "), skipping core UI redraw.");
        return;
    }
    console.log("Commit hash changed (" + (previousCommitHash || 'None') + " -> " + commit + ") or initial load. Queueing core redraw.");
    previousCommitHash = commit;
    // Defer the heavy redraw so the event handler returns promptly; each
    // section updater has its own visibility/efficiency checks.
    setTimeout(() => {
        updateUIScadaCore(data);
        updateUIDrawingCore(data);
        updateUIConflictsCore(data);
    }, 0);
}
// NOTE(review): this span interleaves the OLD showDetailsModal(identifier,
// listKey) with the NEW showDetailsModal(identifier, categoryType, context)
// from a mis-rendered diff — two function headers, duplicated declarations,
// and stale `listKey`-based lines remain. Annotated in place; must be
// reconciled before it will parse.
// --- Modal Display Function (Heavily Updated) ---
// NOTE(review): old-version header — superseded by the 3-argument form below.
function showDetailsModal(identifier, listKey) {
// --- Debounced version of the processing function ---
const debouncedProcessUpdate = debounce(processUpdate, 250); // Single debouncer
// --- Modal Display Function (Needs context) ---
// Intended contract: show the details modal listing the aliases behind a
// clicked pie slice — `identifier` is '__overall__' or a panel name,
// `categoryType` is 'found'/'notFound', `context` picks SCADA vs Drawing.
function showDetailsModal(identifier, categoryType, context) { // Added context
let sourceData = null;
let panelNameDisplay = ""; // Name to show in the title
// NOTE(review): old-version line — `listKey`/`listKeys`/`chartLabels` are
// not defined in the new signature's scope.
const listTypeLabel = chartLabels[listKeys.indexOf(listKey)] || "Details"; // Get nice label
let panelNameDisplay = "";
const listKeysMap = context === 'scada' ? scadaListKeysMap : drawingListKeysMap;
const listTypeLabel = categoryType === 'found'
? (context === 'scada' ? 'Found in SCADA' : 'Found in Drawing')
: (context === 'scada' ? 'Not Found in SCADA' : 'Not Found in Drawing');
if (identifier === '__overall__') {
sourceData = progressDetailsData.overall;
panelNameDisplay = "Overall";
} else {
// NOTE(review): the unguarded lookup is the old line; the null-safe
// lookup two lines below is its replacement.
sourceData = progressDetailsData.panels[identifier];
panelNameDisplay = identifier; // Use panel name from identifier
sourceData = progressDetailsData.panels ? progressDetailsData.panels[identifier] : null;
panelNameDisplay = identifier;
}
// NOTE(review): old-version validation block (listKey-based), followed by
// the condensed new-version checks.
if (!sourceData || !sourceData[listKey]) {
console.error("Data list not found for:", identifier, listKey);
alert(`Could not find data for ${listTypeLabel} in ${panelNameDisplay}.`);
return;
}
if (!sourceData) { /* ... error handling ... */ return; }
const dataList = sourceData[listKey];
const backendListKeys = listKeysMap[categoryType];
if (!backendListKeys) { /* ... error handling ... */ return; }
if (!dataList || dataList.length === 0) {
console.log(`No items to show for:`, panelNameDisplay, listKey);
alert(`No ${listTypeLabel} items found for ${panelNameDisplay}.`);
return;
// New version: merge the per-category backend lists into one array.
let combinedDataList = [];
backendListKeys.forEach(key => {
if (sourceData[key]) {
combinedDataList = combinedDataList.concat(sourceData[key]);
}
});
if (combinedDataList.length === 0) { /* ... alert handling ... */ return; }
const modalTitleElement = document.getElementById('detailsModalLabel');
const modalTableBody = document.querySelector('#detailsModal .modal-body tbody');
// Update modal title dynamically
// NOTE(review): duplicate title/clear lines — dataList-based ones are old.
modalTitleElement.innerHTML = `${listTypeLabel} Items for ${panelNameDisplay} <span class="badge bg-secondary ms-2">${dataList.length}</span>`;
modalTitleElement.innerHTML = `${listTypeLabel} Items for ${panelNameDisplay} <span class="badge bg-secondary ms-2">${combinedDataList.length}</span>`;
modalTableBody.innerHTML = '';
modalTableBody.innerHTML = ''; // Clear previous entries
// Populate table rows with detailed info
// NOTE(review): old iteration header (dataList) followed by its sorted
// combinedDataList replacement.
dataList.forEach(item => {
combinedDataList.sort((a, b) => a.alias.localeCompare(b.alias)).forEach(item => {
const row = document.createElement('tr');
row.insertCell().textContent = item.alias;
row.insertCell().textContent = item.control_panel;
// SCADA Status Cell
const scadaCell = row.insertCell();
scadaCell.innerHTML = item.found_scada
? '<span class="status-yes">Yes</span>'
: '<span class="status-no">No</span>';
// Drawing Status Cell
const drawingCell = row.insertCell();
drawingCell.innerHTML = item.found_drawing
? '<span class="status-yes">Yes</span>'
: '<span class="status-no">No</span>';
row.insertCell().textContent = item.expected_drawing_filename || 'N/A';
// New-version condensed cell population follows.
const scadaCell = row.insertCell(); scadaCell.innerHTML = item.found_scada ? '<span class="status-yes">Yes</span>' : '<span class="status-no">No</span>';
const drawingCell = row.insertCell(); drawingCell.innerHTML = item.found_drawing ? '<span class="status-yes">Yes</span>' : '<span class="status-no">No</span>';
row.insertCell().textContent = item.equipment_type || 'N/A';
row.insertCell().textContent = item.conveyor_type || 'N/A';
// Highlight SCADA-only rows (conflicts) in the shared modal.
if (item.found_scada && !item.found_drawing) { row.classList.add('table-warning'); }
modalTableBody.appendChild(row);
});
// Initialize and show modal
// Lazily create the Bootstrap modal instance on first use.
if (!detailsModalInstance) {
detailsModalInstance = new bootstrap.Modal(document.getElementById('detailsModal'));
}
detailsModalInstance.show();
}
// --- Connect to SSE stream (Unchanged) ---
// --- Navigation Handling ---
// Switch the visible dashboard section, lazily redraw its charts from the
// cached progress data, and sync the active nav-link highlight.
function showSection(sectionId) {
    console.log("Showing section:", sectionId);
    // Hide every section first, then reveal the requested one.
    for (const contentId of ['scada-content', 'drawings-content', 'conflicts-content']) {
        document.getElementById(contentId).style.display = 'none';
    }
    const target = document.getElementById(`${sectionId}-content`);
    if (target) {
        target.style.display = 'block';
        currentVisibleSection = sectionId;
        // Redraw the now-visible section from cached data, if any has
        // arrived; each core updater re-checks visibility internally.
        if (progressDetailsData && Object.keys(progressDetailsData).length > 0) {
            const payload = { progress: progressDetailsData };
            console.log(`Calling update function for now-visible section: ${sectionId}`);
            const updaters = {
                scada: updateUIScadaCore,
                drawings: updateUIDrawingCore,
                conflicts: updateUIConflictsCore
            };
            const updater = updaters[sectionId];
            if (updater) updater(payload);
        } else {
            // Data will be drawn by the SSE handler once it arrives.
            console.log(`Section ${sectionId} shown, but no progress data yet.`);
        }
    } else {
        // Unknown id: fall back to the SCADA view.
        console.error("Attempted to show unknown section:", sectionId);
        document.getElementById('scada-content').style.display = 'block';
        currentVisibleSection = 'scada';
    }
    // Highlight the nav link whose data-target-section matches.
    document.querySelectorAll('.nav-link').forEach(navLink => {
        navLink.classList.remove('active');
        if (navLink.getAttribute('data-target-section') === sectionId) {
            navLink.classList.add('active');
        }
    });
}
// On DOM ready: tag each nav link with its target section (derived from its
// href), wire click handlers that swap sections in place without a page
// reload, then show the default SCADA section.
document.addEventListener('DOMContentLoaded', () => {
    console.log("DOM Loaded, setting up navigation...");
    const hrefToSection = { '/drawings': 'drawings', '/conflicts': 'conflicts' };
    document.querySelectorAll('.nav-link').forEach(navLink => {
        // Anything other than the two known hrefs maps to 'scada'.
        const section = hrefToSection[navLink.getAttribute('href')] || 'scada';
        navLink.setAttribute('data-target-section', section);
        navLink.addEventListener('click', (event) => {
            event.preventDefault(); // keep it a single-page switch
            showSection(navLink.getAttribute('data-target-section'));
        });
    });
    // Default landing view.
    showSection('scada');
});
// --- Connect to SSE stream (Single connection) ---
// NOTE(review): this span is partially diff residue — a stale call to the
// removed updateUI() remains alongside the debounced replacement, a literal
// hunk marker leaked in, and the onmessage handler's closing brace is
// missing from the rendered text. Annotated in place.
console.log("Initializing SSE connection...");
const eventSource = new EventSource("/stream");
eventSource.onmessage = function(event) {
console.log("SSE message received:", event.data);
try {
const data = JSON.parse(event.data);
// NOTE(review): updateUI() no longer exists in the new version — the
// debouncedProcessUpdate call on the next line is its replacement.
updateUI(data); // Call the UI update function with the new data
debouncedProcessUpdate(data); // Call the single debounced processor
} catch (error) {
console.error("Error parsing SSE data:", error);
document.getElementById('status-message').textContent = 'Error processing update from server.';
// NOTE(review): literal diff hunk marker leaked into the source below;
// the catch block and onmessage handler are never closed in this view.
@ -389,10 +653,9 @@
eventSource.onerror = function(err) {
console.error("EventSource failed:", err);
document.getElementById('status-message').textContent = 'Connection to server lost. Retrying...';
// Note: browser usually attempts reconnection automatically
};
// No need for initial fetch here, SSE stream sends initial state on connect
console.log("SSE handler set up.");
</script>
</body>

View File

@ -0,0 +1 @@
pip

View File

@ -0,0 +1,173 @@
Metadata-Version: 2.4
Name: pypdf
Version: 5.4.0
Summary: A pure-python PDF library capable of splitting, merging, cropping, and transforming PDF files
Author-email: Mathieu Fenniak <biziqe@mathieu.fenniak.net>
Maintainer: stefan6419846
Maintainer-email: Martin Thoma <info@martin-thoma.de>
Requires-Python: >=3.8
Description-Content-Type: text/markdown
Classifier: Development Status :: 5 - Production/Stable
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: BSD License
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3 :: Only
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Python :: 3.13
Classifier: Operating System :: OS Independent
Classifier: Topic :: Software Development :: Libraries :: Python Modules
Classifier: Typing :: Typed
License-File: LICENSE
Requires-Dist: typing_extensions >= 4.0; python_version < '3.11'
Requires-Dist: cryptography ; extra == "crypto"
Requires-Dist: PyCryptodome ; extra == "cryptodome"
Requires-Dist: black ; extra == "dev"
Requires-Dist: flit ; extra == "dev"
Requires-Dist: pip-tools ; extra == "dev"
Requires-Dist: pre-commit<2.18.0 ; extra == "dev"
Requires-Dist: pytest-cov ; extra == "dev"
Requires-Dist: pytest-socket ; extra == "dev"
Requires-Dist: pytest-timeout ; extra == "dev"
Requires-Dist: pytest-xdist ; extra == "dev"
Requires-Dist: wheel ; extra == "dev"
Requires-Dist: myst_parser ; extra == "docs"
Requires-Dist: sphinx ; extra == "docs"
Requires-Dist: sphinx_rtd_theme ; extra == "docs"
Requires-Dist: cryptography ; extra == "full"
Requires-Dist: Pillow>=8.0.0 ; extra == "full"
Requires-Dist: Pillow>=8.0.0 ; extra == "image"
Project-URL: Bug Reports, https://github.com/py-pdf/pypdf/issues
Project-URL: Changelog, https://pypdf.readthedocs.io/en/latest/meta/CHANGELOG.html
Project-URL: Documentation, https://pypdf.readthedocs.io/en/latest/
Project-URL: Source, https://github.com/py-pdf/pypdf
Provides-Extra: crypto
Provides-Extra: cryptodome
Provides-Extra: dev
Provides-Extra: docs
Provides-Extra: full
Provides-Extra: image
[![PyPI version](https://badge.fury.io/py/pypdf.svg)](https://badge.fury.io/py/pypdf)
[![Python Support](https://img.shields.io/pypi/pyversions/pypdf.svg)](https://pypi.org/project/pypdf/)
[![](https://img.shields.io/badge/-documentation-green)](https://pypdf.readthedocs.io/en/stable/)
[![GitHub last commit](https://img.shields.io/github/last-commit/py-pdf/pypdf)](https://github.com/py-pdf/pypdf)
[![codecov](https://codecov.io/gh/py-pdf/pypdf/branch/main/graph/badge.svg?token=id42cGNZ5Z)](https://codecov.io/gh/py-pdf/pypdf)
# pypdf
pypdf is a free and open-source pure-python PDF library capable of splitting,
[merging](https://pypdf.readthedocs.io/en/stable/user/merging-pdfs.html),
[cropping, and transforming](https://pypdf.readthedocs.io/en/stable/user/cropping-and-transforming.html)
the pages of PDF files. It can also add
custom data, viewing options, and
[passwords](https://pypdf.readthedocs.io/en/stable/user/encryption-decryption.html)
to PDF files. pypdf can
[retrieve text](https://pypdf.readthedocs.io/en/stable/user/extract-text.html)
and
[metadata](https://pypdf.readthedocs.io/en/stable/user/metadata.html)
from PDFs as well.
See [pdfly](https://github.com/py-pdf/pdfly) for a CLI application that uses pypdf to interact with PDFs.
## Installation
Install pypdf using pip:
```
pip install pypdf
```
For using pypdf with AES encryption or decryption, install extra dependencies:
```
pip install pypdf[crypto]
```
> **NOTE**: `pypdf` 3.1.0 and above include significant improvements compared to
> previous versions. Please refer to [the migration
> guide](https://pypdf.readthedocs.io/en/latest/user/migration-1-to-2.html) for
> more information.
## Usage
```python
from pypdf import PdfReader
reader = PdfReader("example.pdf")
number_of_pages = len(reader.pages)
page = reader.pages[0]
text = page.extract_text()
```
pypdf can do a lot more, e.g. splitting, merging, reading and creating
annotations, decrypting and encrypting, and more. Check out [the
documentation](https://pypdf.readthedocs.io/en/stable/) for additional usage
examples!
For questions and answers, visit
[StackOverflow](https://stackoverflow.com/questions/tagged/pypdf)
(tagged with [pypdf](https://stackoverflow.com/questions/tagged/pypdf)).
## Contributions
Maintaining pypdf is a collaborative effort. You can support the project by
writing documentation, helping to narrow down issues, and submitting code.
See the [CONTRIBUTING.md](https://github.com/py-pdf/pypdf/blob/main/CONTRIBUTING.md) file for more information.
### Q&A
The experience pypdf users have covers the whole range from beginners who
want to make their lives easier to experts who developed software before PDF
existed. You can contribute to the pypdf community by answering questions
on [StackOverflow](https://stackoverflow.com/questions/tagged/pypdf),
helping in [discussions](https://github.com/py-pdf/pypdf/discussions),
and asking users who report issues for [MCVE](https://stackoverflow.com/help/minimal-reproducible-example)'s (Code + example PDF!).
### Issues
A good bug ticket includes a MCVE - a minimal complete verifiable example.
For pypdf, this means that you must upload a PDF that causes the bug to occur
as well as the code you're executing with all of the output. Use
`print(pypdf.__version__)` to tell us which version you're using.
### Code
All code contributions are welcome, but smaller ones have a better chance to
get included in a timely manner. Adding unit tests for new features or test
cases for bugs you've fixed helps us to ensure that the Pull Request (PR) is fine.
pypdf includes a test suite which can be executed with `pytest`:
```bash
$ pytest
===================== test session starts =====================
platform linux -- Python 3.6.15, pytest-7.0.1, pluggy-1.0.0
rootdir: /home/moose/GitHub/Martin/pypdf
plugins: cov-3.0.0
collected 233 items
tests/test_basic_features.py .. [ 0%]
tests/test_constants.py . [ 1%]
tests/test_filters.py .................x..... [ 11%]
tests/test_generic.py ................................. [ 25%]
............. [ 30%]
tests/test_javascript.py .. [ 31%]
tests/test_merger.py . [ 32%]
tests/test_page.py ......................... [ 42%]
tests/test_pagerange.py ................ [ 49%]
tests/test_papersizes.py .................. [ 57%]
tests/test_reader.py .................................. [ 72%]
............... [ 78%]
tests/test_utils.py .................... [ 87%]
tests/test_workflows.py .......... [ 91%]
tests/test_writer.py ................. [ 98%]
tests/test_xmp.py ... [100%]
========== 232 passed, 1 xfailed, 1 warning in 4.52s ==========
```

View File

@ -0,0 +1,113 @@
pypdf-5.4.0.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
pypdf-5.4.0.dist-info/METADATA,sha256=E-D5PSflgLScgSvyNJcLdhpDBX4H0QUafueJFd7PDSA,7262
pypdf-5.4.0.dist-info/RECORD,,
pypdf-5.4.0.dist-info/REQUESTED,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
pypdf-5.4.0.dist-info/WHEEL,sha256=_2ozNFCLWc93bK4WKHCO-eDUENDlo-dgc9cU3qokYO4,82
pypdf-5.4.0.dist-info/licenses/LICENSE,sha256=qXrCMOXzPvEKU2eoUOsB-R8aCwZONHQsd5TSKUVX9SQ,1605
pypdf/__init__.py,sha256=WYkiisiLw4TrsrobuzUkEFGwAUbPF8V8ei_HJSdEJNY,1302
pypdf/__pycache__/__init__.cpython-312.pyc,,
pypdf/__pycache__/_cmap.cpython-312.pyc,,
pypdf/__pycache__/_doc_common.cpython-312.pyc,,
pypdf/__pycache__/_encryption.cpython-312.pyc,,
pypdf/__pycache__/_merger.cpython-312.pyc,,
pypdf/__pycache__/_page.cpython-312.pyc,,
pypdf/__pycache__/_page_labels.cpython-312.pyc,,
pypdf/__pycache__/_protocols.cpython-312.pyc,,
pypdf/__pycache__/_reader.cpython-312.pyc,,
pypdf/__pycache__/_utils.cpython-312.pyc,,
pypdf/__pycache__/_version.cpython-312.pyc,,
pypdf/__pycache__/_writer.cpython-312.pyc,,
pypdf/__pycache__/_xobj_image_helpers.cpython-312.pyc,,
pypdf/__pycache__/constants.cpython-312.pyc,,
pypdf/__pycache__/errors.cpython-312.pyc,,
pypdf/__pycache__/filters.cpython-312.pyc,,
pypdf/__pycache__/pagerange.cpython-312.pyc,,
pypdf/__pycache__/papersizes.cpython-312.pyc,,
pypdf/__pycache__/types.cpython-312.pyc,,
pypdf/__pycache__/xmp.cpython-312.pyc,,
pypdf/_cmap.py,sha256=Q4_EJC73QZ-0_I4jtLeHD-rkT5GASW9zehhNcums_0A,18642
pypdf/_codecs/__init__.py,sha256=WXMkzlMCDlmG5U6ixQk8MrYxaQeJxEfig5DTaGlklLk,1676
pypdf/_codecs/__pycache__/__init__.cpython-312.pyc,,
pypdf/_codecs/__pycache__/_codecs.cpython-312.pyc,,
pypdf/_codecs/__pycache__/adobe_glyphs.cpython-312.pyc,,
pypdf/_codecs/__pycache__/pdfdoc.cpython-312.pyc,,
pypdf/_codecs/__pycache__/std.cpython-312.pyc,,
pypdf/_codecs/__pycache__/symbol.cpython-312.pyc,,
pypdf/_codecs/__pycache__/zapfding.cpython-312.pyc,,
pypdf/_codecs/_codecs.py,sha256=zduPFkHbt9BjCpAc7Mx_rSOTEoSOZkUayr8EL5l82VM,9966
pypdf/_codecs/adobe_glyphs.py,sha256=t3cDFPDqwIz1w9B0gdVzjdc8eEK9AuRjk5f7laEw_fY,447213
pypdf/_codecs/pdfdoc.py,sha256=xfSvMFYsvxuaSQ0Uu9vZDKaB0Wu85h1uCiB1i9rAcUU,4269
pypdf/_codecs/std.py,sha256=DyQMuEpAGEpS9uy1jWf4cnj-kqShPOAij5sI7Q1YD8E,2630
pypdf/_codecs/symbol.py,sha256=nIaGQIlhWCJiPMHrwUlmGHH-_fOXyEKvguRmuKXcGAk,3734
pypdf/_codecs/zapfding.py,sha256=PQxjxRC616d41xF3exVxP1W8nM4QrZfjO3lmtLxpE_s,3742
pypdf/_crypt_providers/__init__.py,sha256=K3Z6AuXhXVeXgLet-Tukq2gt9H66OgdupsvxIS1CmkI,3054
pypdf/_crypt_providers/__pycache__/__init__.cpython-312.pyc,,
pypdf/_crypt_providers/__pycache__/_base.cpython-312.pyc,,
pypdf/_crypt_providers/__pycache__/_cryptography.cpython-312.pyc,,
pypdf/_crypt_providers/__pycache__/_fallback.cpython-312.pyc,,
pypdf/_crypt_providers/__pycache__/_pycryptodome.cpython-312.pyc,,
pypdf/_crypt_providers/_base.py,sha256=_f53Mj6vivhEZMQ4vNxN5G0IOgFY-n5_leke0c_qiNU,1711
pypdf/_crypt_providers/_cryptography.py,sha256=zT3WmbPzesvgHRkGcKAldqJ24MY3BwZViVbSc55Zxhw,4557
pypdf/_crypt_providers/_fallback.py,sha256=vsYoowR1YCAV_q-HrdIZhkUcrCb6HvRBNMYm03QtCU8,3334
pypdf/_crypt_providers/_pycryptodome.py,sha256=U1aQZ9iYBrZo-hKCjJUhGOPhwEFToiitowQ316TNrrA,3381
pypdf/_doc_common.py,sha256=lyM-6je3IbNfzL6gfYdFU2VvX3pkxj5AWHcEZRCFMQk,51871
pypdf/_encryption.py,sha256=pPg7fIfqdL96Tc6RVoBytEVjMrmZFecr_6l7dbtDFrE,48775
pypdf/_merger.py,sha256=YfSQKDiiQz2WtCmVZjxP_nv2pR2shiBf2tDiAb41c7s,1744
pypdf/_page.py,sha256=6Pts2harKZyD_qhKdbNjWLwy07Gw0QLTWIf_fAMENaA,102235
pypdf/_page_labels.py,sha256=nEU0knE7IRQ6LPhzwgw1RjJgm8WxXIfkmiHuv7ep2ow,8546
pypdf/_protocols.py,sha256=noE1y2fVE-z1wq-FkQzaS5exa8ovOFTUXqdQSvqi57c,2142
pypdf/_reader.py,sha256=tf8l66t8DmoeuZviN2YOdFHAwahnTu92ABAXiK9zCUA,51503
pypdf/_text_extraction/__init__.py,sha256=0zxSe5aXqO15dpOg5Q24FawupoTbvJCiHfBzGsWgpJE,8556
pypdf/_text_extraction/__pycache__/__init__.cpython-312.pyc,,
pypdf/_text_extraction/_layout_mode/__init__.py,sha256=k1tN46gDX1zhAatD8oTGMuCJUp-pgbHjyQ8H6axXRgU,338
pypdf/_text_extraction/_layout_mode/__pycache__/__init__.cpython-312.pyc,,
pypdf/_text_extraction/_layout_mode/__pycache__/_fixed_width_page.cpython-312.pyc,,
pypdf/_text_extraction/_layout_mode/__pycache__/_font.cpython-312.pyc,,
pypdf/_text_extraction/_layout_mode/__pycache__/_font_widths.cpython-312.pyc,,
pypdf/_text_extraction/_layout_mode/__pycache__/_text_state_manager.cpython-312.pyc,,
pypdf/_text_extraction/_layout_mode/__pycache__/_text_state_params.cpython-312.pyc,,
pypdf/_text_extraction/_layout_mode/_fixed_width_page.py,sha256=xXC6BwQvrOXMZmSKQ6UPnPtCnjjZ9jCCWTbEJ35E3ko,15424
pypdf/_text_extraction/_layout_mode/_font.py,sha256=F0uvly32AcFeTE4jBFg7JvuAQZSMUjO6HZgQYYFDQ40,7048
pypdf/_text_extraction/_layout_mode/_font_widths.py,sha256=Hfgsd2ftGw8Ajl7IcwNIlfLYnum-ekaadfwErcUdWtI,4265
pypdf/_text_extraction/_layout_mode/_text_state_manager.py,sha256=ugOJRALDNXW3snNAjKKKT8xmWt7D3GZZbcMVaGuVfFM,7989
pypdf/_text_extraction/_layout_mode/_text_state_params.py,sha256=b8DSoJ2easCZW_JvMl84WFFIANKGhLD1zjMVAlqScyU,5318
pypdf/_utils.py,sha256=h97CvvcQpxq7px__GzaMGzJWqJGZt2FYsZYR6wFiU3w,19300
pypdf/_version.py,sha256=xjYaBGUFGg0kGZj_WhuoFyPD8NILPsr79SaMwmYQGSg,22
pypdf/_writer.py,sha256=Kjrk1_uMUyZBlsze0qQhS-We90GIk3WtclKLzb373-s,128663
pypdf/_xobj_image_helpers.py,sha256=KVC80bgNcHBdqGEOfQbmQO4in6Foayt_lPTgSOgb-BA,14020
pypdf/annotations/__init__.py,sha256=f2k_-jAn39CCB27KxQ_e93GinnzkAHbUnnSeGJl1jyE,990
pypdf/annotations/__pycache__/__init__.cpython-312.pyc,,
pypdf/annotations/__pycache__/_base.cpython-312.pyc,,
pypdf/annotations/__pycache__/_markup_annotations.cpython-312.pyc,,
pypdf/annotations/__pycache__/_non_markup_annotations.cpython-312.pyc,,
pypdf/annotations/_base.py,sha256=7rQJyOMPtKkd_Yp2CXGT6KN17W3WOj8Albx6ehMki3w,916
pypdf/annotations/_markup_annotations.py,sha256=F4qMyS15OqXNLL9OTR5Wj2_4vO7ScG60yqNh-wayIFQ,10116
pypdf/annotations/_non_markup_annotations.py,sha256=qX51TJMTRUyWz1ogIK-cXXGK7k5oKhgYQhemA_sVxGE,3622
pypdf/constants.py,sha256=gwFz97ZB5j0Nn5R7LbWBUqBOcyEjIQRV7O598eLZSKc,20959
pypdf/errors.py,sha256=x0J5mTIbp5YcXA1pdYa5DO83uAhXP5NCO0Ankf4DsUY,1740
pypdf/filters.py,sha256=hT6e4odOa6WTpXYOxRm2r6fYOS2cocLsVdNPpjEPhn8,27869
pypdf/generic/__init__.py,sha256=nnLmD7bnhSJu1qZ774pj0eE7lmeRuYDEUcpa52-Mk5A,7168
pypdf/generic/__pycache__/__init__.cpython-312.pyc,,
pypdf/generic/__pycache__/_base.cpython-312.pyc,,
pypdf/generic/__pycache__/_data_structures.cpython-312.pyc,,
pypdf/generic/__pycache__/_files.cpython-312.pyc,,
pypdf/generic/__pycache__/_fit.cpython-312.pyc,,
pypdf/generic/__pycache__/_image_inline.cpython-312.pyc,,
pypdf/generic/__pycache__/_outline.cpython-312.pyc,,
pypdf/generic/__pycache__/_rectangle.cpython-312.pyc,,
pypdf/generic/__pycache__/_utils.cpython-312.pyc,,
pypdf/generic/__pycache__/_viewerpref.cpython-312.pyc,,
pypdf/generic/_base.py,sha256=u8oX747OyUZ5KPG8IYWUGD6lgeL-_MzWX0J-LsY0DjA,30885
pypdf/generic/_data_structures.py,sha256=kqIGv06r3p0BeUrmTePeFrEoB4v0LiulDvEkTt45TN8,63998
pypdf/generic/_files.py,sha256=UcyL_mCDpVh_dRuxxH8bENWA76rYt5eFw0emFcOE79Y,5655
pypdf/generic/_fit.py,sha256=lLkLgW0AQ36sVG4py-HXV__EPQYkLA1bNLoCwGJ_ijI,5511
pypdf/generic/_image_inline.py,sha256=OyP1GDpg-zgH-UWA--vsLIUriV_07-VqpFZ9mL31vl8,11447
pypdf/generic/_outline.py,sha256=qKbMX42OWfqnopIiE6BUy6EvdTLGe3ZtjaiWN85JpaY,1094
pypdf/generic/_rectangle.py,sha256=5KJRbQESqdzrYvJOFcwfp0_v_bhCDVj9r4yMyGXSGyc,3808
pypdf/generic/_utils.py,sha256=8T_2fGpRt9tZpN-06fa-7Wma9gFAkdtgJW2SuD7Yqfk,7415
pypdf/generic/_viewerpref.py,sha256=40YdivA2MAW6hTZEB-b_8Y84-tlNJNwXEusPmHMgS64,6739
pypdf/pagerange.py,sha256=9QqjrP6VrR2m8BN_sbbjZQ8Fi476xPpRiKqd8CxGoKM,6996
pypdf/papersizes.py,sha256=6Tz5sfNN_3JOUapY83U-lakohnpXYA0hSEQNmOVLFL8,1413
pypdf/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
pypdf/types.py,sha256=6B6pMncEhcqFfq-iKs5IBPg6guWXffU6YHpeYzCJH-s,1963
pypdf/xmp.py,sha256=0G9Gmb5lc7jdcGG-MYDSxYPg5P7SU_RswVRipuDY7lU,14246

View File

@ -0,0 +1,4 @@
Wheel-Version: 1.0
Generator: flit 3.11.0
Root-Is-Purelib: true
Tag: py3-none-any

View File

@ -0,0 +1,29 @@
Copyright (c) 2006-2008, Mathieu Fenniak
Some contributions copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com>
Some contributions copyright (c) 2014, Steve Witham <switham_github@mac-guyver.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* The name of the author may not be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

View File

@ -0,0 +1,49 @@
"""
pypdf is a free and open-source pure-python PDF library capable of splitting,
merging, cropping, and transforming the pages of PDF files. It can also add
custom data, viewing options, and passwords to PDF files. pypdf can retrieve
text and metadata from PDFs as well.
You can read the full docs at https://pypdf.readthedocs.io/.
"""
from ._crypt_providers import crypt_provider
from ._doc_common import DocumentInformation
from ._encryption import PasswordType
from ._merger import PdfMerger
from ._page import PageObject, Transformation, mult
from ._reader import PdfReader
from ._version import __version__
from ._writer import ObjectDeletionFlag, PdfWriter
from .constants import ImageType
from .pagerange import PageRange, parse_filename_page_ranges
from .papersizes import PaperSize
try:
import PIL
pil_version = PIL.__version__
except ImportError:
pil_version = "none"
_debug_versions = (
f"pypdf=={__version__}, {crypt_provider=}, PIL={pil_version}"
)
__all__ = [
"DocumentInformation",
"ImageType",
"ObjectDeletionFlag",
"PageObject",
"PageRange",
"PaperSize",
"PasswordType",
"PdfMerger",
"PdfReader",
"PdfWriter",
"Transformation",
"__version__",
"_debug_versions",
"mult",
"parse_filename_page_ranges",
]

View File

@ -0,0 +1,544 @@
import binascii
from binascii import unhexlify
from math import ceil
from typing import Any, Dict, List, Tuple, Union, cast
from ._codecs import adobe_glyphs, charset_encoding
from ._utils import logger_error, logger_warning
from .generic import (
DecodedStreamObject,
DictionaryObject,
StreamObject,
is_null_or_none,
)
# code freely inspired from @twiggy ; see #711
def build_char_map(
font_name: str, space_width: float, obj: DictionaryObject
) -> Tuple[str, float, Union[str, Dict[int, str]], Dict[Any, Any], DictionaryObject]:
"""
Determine information about a font.
Args:
font_name: font name as a string
space_width: default space width if no data is found.
obj: XObject or Page where you can find a /Resource dictionary
Returns:
Font sub-type, space_width criteria (50% of width), encoding, map character-map, font-dictionary.
The font-dictionary itself is suitable for the curious.
"""
ft: DictionaryObject = obj["/Resources"]["/Font"][font_name] # type: ignore
font_subtype, font_halfspace, font_encoding, font_map = build_char_map_from_dict(
space_width, ft
)
return font_subtype, font_halfspace, font_encoding, font_map, ft
def build_char_map_from_dict(
space_width: float, ft: DictionaryObject
) -> Tuple[str, float, Union[str, Dict[int, str]], Dict[Any, Any]]:
"""
Determine information about a font.
Args:
space_width: default space with if no data found
(normally half the width of a character).
ft: Font Dictionary
Returns:
Font sub-type, space_width criteria(50% of width), encoding, map character-map.
The font-dictionary itself is suitable for the curious.
"""
font_type = cast(str, ft["/Subtype"].get_object())
encoding, map_dict = get_encoding(ft)
space_key_char = get_actual_str_key(" ", encoding, map_dict)
font_width_map = build_font_width_map(ft, space_width * 2.0)
half_space_width = compute_space_width(font_width_map, space_key_char) / 2.0
return (
font_type,
half_space_width,
encoding,
# https://github.com/python/mypy/issues/4374
map_dict
)
# used when missing data, e.g. font def missing
unknown_char_map: Tuple[str, float, Union[str, Dict[int, str]], Dict[Any, Any]] = (
"Unknown",
9999,
dict.fromkeys(range(256), "<EFBFBD>"),
{},
)
_predefined_cmap: Dict[str, str] = {
"/Identity-H": "utf-16-be",
"/Identity-V": "utf-16-be",
"/GB-EUC-H": "gbk",
"/GB-EUC-V": "gbk",
"/GBpc-EUC-H": "gb2312",
"/GBpc-EUC-V": "gb2312",
"/GBK-EUC-H": "gbk",
"/GBK-EUC-V": "gbk",
"/GBK2K-H": "gb18030",
"/GBK2K-V": "gb18030",
"/ETen-B5-H": "cp950",
"/ETen-B5-V": "cp950",
"/ETenms-B5-H": "cp950",
"/ETenms-B5-V": "cp950",
"/UniCNS-UTF16-H": "utf-16-be",
"/UniCNS-UTF16-V": "utf-16-be",
"/UniGB-UTF16-H": "gb18030",
"/UniGB-UTF16-V": "gb18030",
# UCS2 in code
}
# manually extracted from http://mirrors.ctan.org/fonts/adobe/afm/Adobe-Core35_AFMs-229.tar.gz
_default_fonts_space_width: Dict[str, int] = {
"/Courier": 600,
"/Courier-Bold": 600,
"/Courier-BoldOblique": 600,
"/Courier-Oblique": 600,
"/Helvetica": 278,
"/Helvetica-Bold": 278,
"/Helvetica-BoldOblique": 278,
"/Helvetica-Oblique": 278,
"/Helvetica-Narrow": 228,
"/Helvetica-NarrowBold": 228,
"/Helvetica-NarrowBoldOblique": 228,
"/Helvetica-NarrowOblique": 228,
"/Times-Roman": 250,
"/Times-Bold": 250,
"/Times-BoldItalic": 250,
"/Times-Italic": 250,
"/Symbol": 250,
"/ZapfDingbats": 278,
}
def get_encoding(
ft: DictionaryObject
) -> Tuple[Union[str, Dict[int, str]], Dict[Any, Any]]:
encoding = _parse_encoding(ft)
map_dict, int_entry = _parse_to_unicode(ft)
# Apply rule from PDF ref 1.7 §5.9.1, 1st bullet:
# if cmap not empty encoding should be discarded
# (here transformed into identity for those characters)
# If encoding is a string it is expected to be an identity translation.
if isinstance(encoding, dict):
for x in int_entry:
if x <= 255:
encoding[x] = chr(x)
return encoding, map_dict
def _parse_encoding(
ft: DictionaryObject
) -> Union[str, Dict[int, str]]:
encoding: Union[str, List[str], Dict[int, str]] = []
if "/Encoding" not in ft:
if "/BaseFont" in ft and cast(str, ft["/BaseFont"]) in charset_encoding:
encoding = dict(
zip(range(256), charset_encoding[cast(str, ft["/BaseFont"])])
)
else:
encoding = "charmap"
return encoding
enc: Union(str, DictionaryObject) = ft["/Encoding"].get_object() # type: ignore
if isinstance(enc, str):
try:
# already done : enc = NameObject.unnumber(enc.encode()).decode()
# for #xx decoding
if enc in charset_encoding:
encoding = charset_encoding[enc].copy()
elif enc in _predefined_cmap:
encoding = _predefined_cmap[enc]
elif "-UCS2-" in enc:
encoding = "utf-16-be"
else:
raise Exception("not found")
except Exception:
logger_error(f"Advanced encoding {enc} not implemented yet", __name__)
encoding = enc
elif isinstance(enc, DictionaryObject) and "/BaseEncoding" in enc:
try:
encoding = charset_encoding[cast(str, enc["/BaseEncoding"])].copy()
except Exception:
logger_error(
f"Advanced encoding {encoding} not implemented yet",
__name__,
)
encoding = charset_encoding["/StandardEncoding"].copy()
else:
encoding = charset_encoding["/StandardEncoding"].copy()
if "/Differences" in enc:
x: int = 0
o: Union[int, str]
for o in cast(DictionaryObject, cast(DictionaryObject, enc)["/Differences"]):
if isinstance(o, int):
x = o
else: # isinstance(o,str):
try:
if x < len(encoding):
encoding[x] = adobe_glyphs[o] # type: ignore
except Exception:
encoding[x] = o # type: ignore
x += 1
if isinstance(encoding, list):
encoding = dict(zip(range(256), encoding))
return encoding
def _parse_to_unicode(
ft: DictionaryObject
) -> Tuple[Dict[Any, Any], List[int]]:
# will store all translation code
# and map_dict[-1] we will have the number of bytes to convert
map_dict: Dict[Any, Any] = {}
# will provide the list of cmap keys as int to correct encoding
int_entry: List[int] = []
if "/ToUnicode" not in ft:
if ft.get("/Subtype", "") == "/Type1":
return _type1_alternative(ft, map_dict, int_entry)
else:
return {}, []
process_rg: bool = False
process_char: bool = False
multiline_rg: Union[
None, Tuple[int, int]
] = None # tuple = (current_char, remaining size) ; cf #1285 for example of file
cm = prepare_cm(ft)
for line in cm.split(b"\n"):
process_rg, process_char, multiline_rg = process_cm_line(
line.strip(b" \t"),
process_rg,
process_char,
multiline_rg,
map_dict,
int_entry,
)
return map_dict, int_entry
def get_actual_str_key(
value_char: str, encoding: Union[str, Dict[int, str]], map_dict: Dict[Any, Any]
) -> str:
key_dict = {}
if isinstance(encoding, dict):
key_dict = {value: chr(key) for key, value in encoding.items() if value == value_char}
else:
key_dict = {value: key for key, value in map_dict.items() if value == value_char}
key_char = key_dict.get(value_char, value_char)
return key_char
def prepare_cm(ft: DictionaryObject) -> bytes:
tu = ft["/ToUnicode"]
cm: bytes
if isinstance(tu, StreamObject):
cm = cast(DecodedStreamObject, ft["/ToUnicode"]).get_data()
else: # if (tu is None) or cast(str, tu).startswith("/Identity"):
# the full range 0000-FFFF will be processed
cm = b"beginbfrange\n<0000> <0001> <0000>\nendbfrange"
if isinstance(cm, str):
cm = cm.encode()
# we need to prepare cm before due to missing return line in pdf printed
# to pdf from word
cm = (
cm.strip()
.replace(b"beginbfchar", b"\nbeginbfchar\n")
.replace(b"endbfchar", b"\nendbfchar\n")
.replace(b"beginbfrange", b"\nbeginbfrange\n")
.replace(b"endbfrange", b"\nendbfrange\n")
.replace(b"<<", b"\n{\n") # text between << and >> not used but
.replace(b">>", b"\n}\n") # some solution to find it back
)
ll = cm.split(b"<")
for i in range(len(ll)):
j = ll[i].find(b">")
if j >= 0:
if j == 0:
# string is empty: stash a placeholder here (see below)
# see https://github.com/py-pdf/pypdf/issues/1111
content = b"."
else:
content = ll[i][:j].replace(b" ", b"")
ll[i] = content + b" " + ll[i][j + 1 :]
cm = (
(b" ".join(ll))
.replace(b"[", b" [ ")
.replace(b"]", b" ]\n ")
.replace(b"\r", b"\n")
)
return cm
def process_cm_line(
line: bytes,
process_rg: bool,
process_char: bool,
multiline_rg: Union[None, Tuple[int, int]],
map_dict: Dict[Any, Any],
int_entry: List[int],
) -> Tuple[bool, bool, Union[None, Tuple[int, int]]]:
if line == b"" or line[0] == 37: # 37 = %
return process_rg, process_char, multiline_rg
line = line.replace(b"\t", b" ")
if b"beginbfrange" in line:
process_rg = True
elif b"endbfrange" in line:
process_rg = False
elif b"beginbfchar" in line:
process_char = True
elif b"endbfchar" in line:
process_char = False
elif process_rg:
try:
multiline_rg = parse_bfrange(line, map_dict, int_entry, multiline_rg)
except binascii.Error as error:
logger_warning(f"Skipping broken line {line!r}: {error}", __name__)
elif process_char:
parse_bfchar(line, map_dict, int_entry)
return process_rg, process_char, multiline_rg
def parse_bfrange(
line: bytes,
map_dict: Dict[Any, Any],
int_entry: List[int],
multiline_rg: Union[None, Tuple[int, int]],
) -> Union[None, Tuple[int, int]]:
lst = [x for x in line.split(b" ") if x]
closure_found = False
if multiline_rg is not None:
fmt = b"%%0%dX" % (map_dict[-1] * 2)
a = multiline_rg[0] # a, b not in the current line
b = multiline_rg[1]
for sq in lst[0:]:
if sq == b"]":
closure_found = True
break
map_dict[
unhexlify(fmt % a).decode(
"charmap" if map_dict[-1] == 1 else "utf-16-be",
"surrogatepass",
)
] = unhexlify(sq).decode("utf-16-be", "surrogatepass")
int_entry.append(a)
a += 1
else:
a = int(lst[0], 16)
b = int(lst[1], 16)
nbi = max(len(lst[0]), len(lst[1]))
map_dict[-1] = ceil(nbi / 2)
fmt = b"%%0%dX" % (map_dict[-1] * 2)
if lst[2] == b"[":
for sq in lst[3:]:
if sq == b"]":
closure_found = True
break
map_dict[
unhexlify(fmt % a).decode(
"charmap" if map_dict[-1] == 1 else "utf-16-be",
"surrogatepass",
)
] = unhexlify(sq).decode("utf-16-be", "surrogatepass")
int_entry.append(a)
a += 1
else: # case without list
c = int(lst[2], 16)
fmt2 = b"%%0%dX" % max(4, len(lst[2]))
closure_found = True
while a <= b:
map_dict[
unhexlify(fmt % a).decode(
"charmap" if map_dict[-1] == 1 else "utf-16-be",
"surrogatepass",
)
] = unhexlify(fmt2 % c).decode("utf-16-be", "surrogatepass")
int_entry.append(a)
a += 1
c += 1
return None if closure_found else (a, b)
def parse_bfchar(line: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) -> None:
lst = [x for x in line.split(b" ") if x]
map_dict[-1] = len(lst[0]) // 2
while len(lst) > 1:
map_to = ""
# placeholder (see above) means empty string
if lst[1] != b".":
map_to = unhexlify(lst[1]).decode(
"charmap" if len(lst[1]) < 4 else "utf-16-be", "surrogatepass"
) # join is here as some cases where the code was split
map_dict[
unhexlify(lst[0]).decode(
"charmap" if map_dict[-1] == 1 else "utf-16-be", "surrogatepass"
)
] = map_to
int_entry.append(int(lst[0], 16))
lst = lst[2:]
def build_font_width_map(
ft: DictionaryObject, default_font_width: float
) -> Dict[Any, float]:
font_width_map: Dict[Any, float] = {}
st: int = 0
en: int = 0
try:
default_font_width = _default_fonts_space_width[cast(str, ft["/BaseFont"].get_object())] * 2.0
except KeyError:
pass
if "/DescendantFonts" in ft: # ft["/Subtype"].startswith("/CIDFontType"):
# §9.7.4.3 of the 1.7 reference ("Glyph Metrics in CIDFonts")
# Widths for a CIDFont are defined using the DW and W entries.
# DW2 and W2 are for vertical use. Vertical type is not implemented.
ft1 = ft["/DescendantFonts"][0].get_object() # type: ignore
if "/DW" in ft1:
font_width_map["default"] = cast(float, ft1["/DW"].get_object())
else:
font_width_map["default"] = default_font_width
if "/W" in ft1:
w = ft1["/W"].get_object()
else:
w = []
while len(w) > 0:
st = w[0] if isinstance(w[0], int) else w[0].get_object()
second = w[1].get_object()
if isinstance(second, int):
# C_first C_last same_W
en = second
width = w[2].get_object()
if not isinstance(width, (int, float)):
logger_warning(f"Expected numeric value for width, got {width}. Ignoring it.", __name__)
w = w[3:]
continue
for c_code in range(st, en + 1):
font_width_map[chr(c_code)] = width
w = w[3:]
elif isinstance(second, list):
# Starting_C [W1 W2 ... Wn]
c_code = st
for ww in second:
width = ww.get_object()
font_width_map[chr(c_code)] = width
c_code += 1
w = w[2:]
else:
logger_warning(
"unknown widths : \n" + (ft1["/W"]).__repr__(),
__name__,
)
break
elif "/Widths" in ft:
w = ft["/Widths"].get_object()
if "/FontDescriptor" in ft and "/MissingWidth" in cast(
DictionaryObject, ft["/FontDescriptor"]
):
font_width_map["default"] = ft["/FontDescriptor"]["/MissingWidth"].get_object() # type: ignore
else:
# will consider width of char as avg(width)
m = 0
cpt = 0
for xx in w:
xx = xx.get_object()
if xx > 0:
m += xx
cpt += 1
font_width_map["default"] = m / max(1, cpt)
st = cast(int, ft["/FirstChar"])
en = cast(int, ft["/LastChar"])
for c_code in range(st, en + 1):
try:
width = w[c_code - st].get_object()
font_width_map[chr(c_code)] = width
except (IndexError, KeyError):
# The PDF structure is invalid. The array is too small
# for the specified font width.
pass
if is_null_or_none(font_width_map.get("default")):
font_width_map["default"] = default_font_width if default_font_width else 0.0
return font_width_map
def compute_space_width(
font_width_map: Dict[Any, float], space_char: str
) -> float:
try:
sp_width = font_width_map[space_char]
if sp_width == 0:
raise ValueError("Zero width")
except (KeyError, ValueError):
sp_width = (
font_width_map["default"] / 2.0
) # if using default we consider space will be only half size
return sp_width
def compute_font_width(
font_width_map: Dict[Any, float],
char: str
) -> float:
char_width: float = 0.0
try:
char_width = font_width_map[char]
except KeyError:
char_width = (
font_width_map["default"]
)
return char_width
def _type1_alternative(
ft: DictionaryObject,
map_dict: Dict[Any, Any],
int_entry: List[int],
) -> Tuple[Dict[Any, Any], List[int]]:
if "/FontDescriptor" not in ft:
return map_dict, int_entry
ft_desc = cast(DictionaryObject, ft["/FontDescriptor"]).get("/FontFile")
if is_null_or_none(ft_desc):
return map_dict, int_entry
assert ft_desc is not None, "mypy"
txt = ft_desc.get_object().get_data()
txt = txt.split(b"eexec\n")[0] # only clear part
txt = txt.split(b"/Encoding")[1] # to get the encoding part
lines = txt.replace(b"\r", b"\n").split(b"\n")
for li in lines:
if li.startswith(b"dup"):
words = [_w for _w in li.split(b" ") if _w != b""]
if len(words) > 3 and words[3] != b"put":
continue
try:
i = int(words[1])
except ValueError: # pragma: no cover
continue
try:
v = adobe_glyphs[words[2].decode()]
except KeyError:
if words[2].startswith(b"/uni"):
try:
v = chr(int(words[2][4:], 16))
except ValueError: # pragma: no cover
continue
else:
continue
map_dict[chr(i)] = v
int_entry.append(i)
return map_dict, int_entry

View File

@ -0,0 +1,61 @@
from typing import Dict, List
from .adobe_glyphs import adobe_glyphs
from .pdfdoc import _pdfdoc_encoding
from .std import _std_encoding
from .symbol import _symbol_encoding
from .zapfding import _zapfding_encoding
def fill_from_encoding(enc: str) -> List[str]:
lst: List[str] = []
for x in range(256):
try:
lst += (bytes((x,)).decode(enc),)
except Exception:
lst += (chr(x),)
return lst
def rev_encoding(enc: List[str]) -> Dict[str, int]:
rev: Dict[str, int] = {}
for i in range(256):
char = enc[i]
if char == "\u0000":
continue
assert char not in rev, f"{char} at {i} already at {rev[char]}"
rev[char] = i
return rev
_win_encoding = fill_from_encoding("cp1252")
_mac_encoding = fill_from_encoding("mac_roman")
_win_encoding_rev: Dict[str, int] = rev_encoding(_win_encoding)
_mac_encoding_rev: Dict[str, int] = rev_encoding(_mac_encoding)
_symbol_encoding_rev: Dict[str, int] = rev_encoding(_symbol_encoding)
_zapfding_encoding_rev: Dict[str, int] = rev_encoding(_zapfding_encoding)
_pdfdoc_encoding_rev: Dict[str, int] = rev_encoding(_pdfdoc_encoding)
charset_encoding: Dict[str, List[str]] = {
"/StandardEncoding": _std_encoding,
"/WinAnsiEncoding": _win_encoding,
"/MacRomanEncoding": _mac_encoding,
"/PDFDocEncoding": _pdfdoc_encoding,
"/Symbol": _symbol_encoding,
"/ZapfDingbats": _zapfding_encoding,
}
__all__ = [
"_mac_encoding",
"_pdfdoc_encoding",
"_pdfdoc_encoding_rev",
"_std_encoding",
"_symbol_encoding",
"_win_encoding",
"_zapfding_encoding",
"adobe_glyphs",
"charset_encoding",
]

View File

@ -0,0 +1,268 @@
"""
This module is for codecs only.
While the codec implementation can contain details of the PDF specification,
the module should not do any PDF parsing.
"""
import io
from abc import ABC, abstractmethod
from typing import Dict, List
from pypdf._utils import logger_warning
class Codec(ABC):
"""Abstract base class for all codecs."""
@abstractmethod
def encode(self, data: bytes) -> bytes:
"""
Encode the input data.
Args:
data: Data to encode.
Returns:
Encoded data.
"""
@abstractmethod
def decode(self, data: bytes) -> bytes:
"""
Decode the input data.
Args:
data: Data to decode.
Returns:
Decoded data.
"""
class LzwCodec(Codec):
"""Lempel-Ziv-Welch (LZW) adaptive compression codec."""
CLEAR_TABLE_MARKER = 256 # Special code to indicate table reset
EOD_MARKER = 257 # End-of-data marker
INITIAL_BITS_PER_CODE = 9 # Initial code bit width
MAX_BITS_PER_CODE = 12 # Maximum code bit width
def _initialize_encoding_table(self) -> None:
"""Initialize the encoding table and state to initial conditions."""
self.encoding_table: Dict[bytes, int] = {bytes([i]): i for i in range(256)}
self.next_code = self.EOD_MARKER + 1
self.bits_per_code = self.INITIAL_BITS_PER_CODE
self.max_code_value = (1 << self.bits_per_code) - 1
def _increase_next_code(self) -> None:
"""Update bits_per_code and max_code_value if necessary."""
self.next_code += 1
if (
self.next_code > self.max_code_value
and self.bits_per_code < self.MAX_BITS_PER_CODE
):
self.bits_per_code += 1
self.max_code_value = (1 << self.bits_per_code) - 1
def encode(self, data: bytes) -> bytes:
"""
Encode data using the LZW compression algorithm.
Taken from PDF 1.7 specs, "7.4.4.2 Details of LZW Encoding".
"""
result_codes: List[int] = []
# The encoder shall begin by issuing a clear-table code
result_codes.append(self.CLEAR_TABLE_MARKER)
self._initialize_encoding_table()
current_sequence = b""
for byte in data:
next_sequence = current_sequence + bytes([byte])
if next_sequence in self.encoding_table:
# Extend current sequence if already in the table
current_sequence = next_sequence
else:
# Output code for the current sequence
result_codes.append(self.encoding_table[current_sequence])
# Add the new sequence to the table if there's room
if self.next_code <= (1 << self.MAX_BITS_PER_CODE) - 1:
self.encoding_table[next_sequence] = self.next_code
self._increase_next_code()
else:
# If the table is full, emit a clear-table command
result_codes.append(self.CLEAR_TABLE_MARKER)
self._initialize_encoding_table()
# Start new sequence
current_sequence = bytes([byte])
# Ensure everything actually is encoded
if current_sequence:
result_codes.append(self.encoding_table[current_sequence])
result_codes.append(self.EOD_MARKER)
return self._pack_codes_into_bytes(result_codes)
def _pack_codes_into_bytes(self, codes: List[int]) -> bytes:
"""
Convert the list of result codes into a continuous byte stream, with codes packed as per the code bit-width.
The bit-width starts at 9 bits and expands as needed.
"""
self._initialize_encoding_table()
buffer = 0
bits_in_buffer = 0
output = bytearray()
for code in codes:
buffer = (buffer << self.bits_per_code) | code
bits_in_buffer += self.bits_per_code
# Codes shall be packed into a continuous bit stream, high-order bit
# first. This stream shall then be divided into bytes, high-order bit
# first.
while bits_in_buffer >= 8:
bits_in_buffer -= 8
output.append((buffer >> bits_in_buffer) & 0xFF)
if code == self.CLEAR_TABLE_MARKER:
self._initialize_encoding_table()
elif code == self.EOD_MARKER:
continue
else:
self._increase_next_code()
# Flush any remaining bits in the buffer
if bits_in_buffer > 0:
output.append((buffer << (8 - bits_in_buffer)) & 0xFF)
return bytes(output)
def _initialize_decoding_table(self) -> None:
self.max_code_value = (1 << self.MAX_BITS_PER_CODE) - 1
self.decoding_table = [bytes([i]) for i in range(self.CLEAR_TABLE_MARKER)] + [
b""
] * (self.max_code_value - self.CLEAR_TABLE_MARKER + 1)
self._table_index = self.EOD_MARKER + 1
self._bits_to_get = 9
def _next_code_decode(self, data: bytes) -> int:
self._next_data: int
try:
while self._next_bits < self._bits_to_get:
self._next_data = (self._next_data << 8) | (
data[self._byte_pointer] & 0xFF
)
self._byte_pointer += 1
self._next_bits += 8
code = (
self._next_data >> (self._next_bits - self._bits_to_get)
) & self._and_table[self._bits_to_get - 9]
self._next_bits -= self._bits_to_get
return code
except IndexError:
return self.EOD_MARKER
# The following method has been converted to Python from PDFsharp:
# https://github.com/empira/PDFsharp/blob/5fbf6ed14740bc4e16786816882d32e43af3ff5d/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Filters/LzwDecode.cs
#
# Original license:
#
# -------------------------------------------------------------------------
# Copyright (c) 2001-2024 empira Software GmbH, Troisdorf (Cologne Area),
# Germany
#
# http://docs.pdfsharp.net
#
# MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
# --------------------------------------------------------------------------
    def decode(self, data: bytes) -> bytes:
        """
        Decode an LZW-compressed byte stream.

        The following code was converted to Python from the following code:
        https://github.com/empira/PDFsharp/blob/master/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Filters/LzwDecode.cs

        Args:
            data: LZW-encoded input bytes.

        Returns:
            The decompressed bytes.
        """
        # Masks for extracting 9-, 10-, 11- and 12-bit codes.
        self._and_table = [511, 1023, 2047, 4095]
        self._table_index = 0
        self._bits_to_get = 9
        self._byte_pointer = 0
        self._next_data = 0
        self._next_bits = 0
        output_stream = io.BytesIO()
        self._initialize_decoding_table()
        self._byte_pointer = 0
        self._next_data = 0
        self._next_bits = 0
        old_code = self.CLEAR_TABLE_MARKER
        while True:
            code = self._next_code_decode(data)
            if code == self.EOD_MARKER:
                break
            if code == self.CLEAR_TABLE_MARKER:
                # Reset the table, then emit the next (literal) code directly.
                self._initialize_decoding_table()
                code = self._next_code_decode(data)
                if code == self.EOD_MARKER:
                    break
                output_stream.write(self.decoding_table[code])
                old_code = code
            elif code < self._table_index:
                # Known code: emit its sequence and register
                # old sequence + first byte of the new one.
                string = self.decoding_table[code]
                output_stream.write(string)
                if old_code != self.CLEAR_TABLE_MARKER:
                    self._add_entry_decode(self.decoding_table[old_code], string[0])
                old_code = code
            else:
                # The code is not in the table and not one of the special codes
                string = (
                    self.decoding_table[old_code] + self.decoding_table[old_code][:1]
                )
                output_stream.write(string)
                self._add_entry_decode(self.decoding_table[old_code], string[0])
                old_code = code
        output = output_stream.getvalue()
        return output
    def _add_entry_decode(self, old_string: bytes, new_char: int) -> None:
        """Append ``old_string + bytes([new_char])`` to the decoding table.

        Widens the code size when the table reaches the 9/10/11-bit
        capacity boundaries; silently ignores additions past
        ``self.max_code_value``.
        """
        new_string = old_string + bytes([new_char])
        if self._table_index > self.max_code_value:
            logger_warning("Ignoring too large LZW table index.", __name__)
            return
        self.decoding_table[self._table_index] = new_string
        self._table_index += 1
        # Update the number of bits to get based on the table index
        # (width grows one entry before the table is actually full).
        if self._table_index == 511:
            self._bits_to_get = 10
        elif self._table_index == 1023:
            self._bits_to_get = 11
        elif self._table_index == 2047:
            self._bits_to_get = 12

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,264 @@
# PDFDocEncoding Character Set: Table D.2 of PDF Reference 1.7
# C.1 Predefined encodings sorted by character name of another PDF reference
# Some indices have '\u0000' although they should have something else:
# 22: should be '\u0017'
# The table is written as sixteen 16-character rows concatenated into a
# single 256-character string; list() splits it into the usual
# one-character-per-code table.
_pdfdoc_encoding = list(
    "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f"  # 0x00-0x0f
    "\x10\x11\x12\x13\x14\x15\x00\x17\u02d8\u02c7\u02c6\u02d9\u02dd\u02db\u02da\u02dc"  # 0x10-0x1f
    " !\"#$%&'()*+,-./"  # 0x20-0x2f
    "0123456789:;<=>?"  # 0x30-0x3f
    "@ABCDEFGHIJKLMNO"  # 0x40-0x4f
    "PQRSTUVWXYZ[\\]^_"  # 0x50-0x5f
    "`abcdefghijklmno"  # 0x60-0x6f
    "pqrstuvwxyz{|}~\x00"  # 0x70-0x7f
    "\u2022\u2020\u2021\u2026\u2014\u2013\u0192\u2044\u2039\u203a\u2212\u2030\u201e\u201c\u201d\u2018"  # 0x80-0x8f
    "\u2019\u201a\u2122\ufb01\ufb02\u0141\u0152\u0160\u0178\u017d\u0131\u0142\u0153\u0161\u017e\x00"  # 0x90-0x9f
    "\u20ac\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\x00\xae\xaf"  # 0xa0-0xaf
    "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf"  # 0xb0-0xbf
    "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf"  # 0xc0-0xcf
    "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf"  # 0xd0-0xdf
    "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"  # 0xe0-0xef
    "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff"  # 0xf0-0xff
)
assert len(_pdfdoc_encoding) == 256

View File

@ -0,0 +1,258 @@
# Adobe StandardEncoding table: _std_encoding[byte] -> unicode character,
# laid out eight entries per line.
# NOTE(review): several entries are empty strings (e.g. indices 39, 96,
# 164, 170-175) — these look like characters lost in an earlier
# conversion; verify against the Adobe StandardEncoding specification.
_std_encoding = [
    "\x00", "\x01", "\x02", "\x03", "\x04", "\x05", "\x06", "\x07",
    "\x08", "\t", "\n", "\x0b", "\x0c", "\r", "\x0e", "\x0f",
    "\x10", "\x11", "\x12", "\x13", "\x14", "\x15", "\x16", "\x17",
    "\x18", "\x19", "\x1a", "\x1b", "\x1c", "\x1d", "\x1e", "\x1f",
    " ", "!", '"', "#", "$", "%", "&", "",
    "(", ")", "*", "+", ",", "-", ".", "/",
    "0", "1", "2", "3", "4", "5", "6", "7",
    "8", "9", ":", ";", "<", "=", ">", "?",
    "@", "A", "B", "C", "D", "E", "F", "G",
    "H", "I", "J", "K", "L", "M", "N", "O",
    "P", "Q", "R", "S", "T", "U", "V", "W",
    "X", "Y", "Z", "[", "\\", "]", "^", "_",
    "", "a", "b", "c", "d", "e", "f", "g",
    "h", "i", "j", "k", "l", "m", "n", "o",
    "p", "q", "r", "s", "t", "u", "v", "w",
    "x", "y", "z", "{", "|", "}", "~", "\x7f",
    "\x80", "\x81", "\x82", "\x83", "\x84", "\x85", "\x86", "\x87",
    "\x88", "\x89", "\x8a", "\x8b", "\x8c", "\x8d", "\x8e", "\x8f",
    "\x90", "\x91", "\x92", "\x93", "\x94", "\x95", "\x96", "\x97",
    "\x98", "\x99", "\x9a", "\x9b", "\x9c", "\x9d", "\x9e", "\x9f",
    "\xa0", "¡", "¢", "£", "", "¥", "ƒ", "§",
    "¤", "'", "", "«", "", "", "", "",
    "°", "", "", "", "·", "µ", "", "",
    "", "", "", "»", "", "", "¾", "¿",
    "À", "`", "´", "ˆ", "˜", "¯", "˘", "˙",
    "¨", "É", "˚", "¸", "Ì", "˝", "˛", "ˇ",
    "", "Ñ", "Ò", "Ó", "Ô", "Õ", "Ö", "×",
    "Ø", "Ù", "Ú", "Û", "Ü", "Ý", "Þ", "ß",
    "à", "Æ", "â", "ª", "ä", "å", "æ", "ç",
    "Ł", "Ø", "Œ", "º", "ì", "í", "î", "ï",
    "ð", "æ", "ò", "ó", "ô", "ı", "ö", "÷",
    "ł", "ø", "œ", "ß", "ü", "ý", "þ", "ÿ",
]

View File

@ -0,0 +1,260 @@
# manually generated from https://www.unicode.org/Public/MAPPINGS/VENDORS/ADOBE/symbol.txt
# The table is written as sixteen 16-character rows concatenated into a
# single 256-character string; list() splits it into the usual
# one-character-per-code table.
_symbol_encoding = list(
    "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f"  # 0x00-0x0f
    "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"  # 0x10-0x1f
    " !\u2200#\u2203%&\u220b()\u2217+,\u2212./"  # 0x20-0x2f
    "0123456789:;<=>?"  # 0x30-0x3f
    "\u2245\u0391\u0392\u03a7\u0394\u0395\u03a6\u0393\u0397\u0399\u03d1\u039a\u039b\u039c\u039d\u039f"  # 0x40-0x4f
    "\u03a0\u0398\u03a1\u03a3\u03a4\u03a5\u03c2\u03a9\u039e\u03a8\u0396[\u2234]\u22a5_"  # 0x50-0x5f
    "\uf8e5\u03b1\u03b2\u03c7\u03b4\u03b5\u03c6\u03b3\u03b7\u03b9\u03d5\u03ba\u03bb\u00b5\u03bd\u03bf"  # 0x60-0x6f
    "\u03c0\u03b8\u03c1\u03c3\u03c4\u03c5\u03d6\u03c9\u03be\u03c8\u03b6{|}\u223c\x7f"  # 0x70-0x7f
    "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f"  # 0x80-0x8f
    "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f"  # 0x90-0x9f
    "\u20ac\u03d2\u2032\u2264\u2044\u221e\u0192\u2663\u2666\u2665\u2660\u2194\u2190\u2191\u2192\u2193"  # 0xa0-0xaf
    "\u00b0\u00b1\u2033\u2265\u00d7\u221d\u2202\u2022\u00f7\u2260\u2261\u2248\u2026\uf8e6\uf8e7\u21b5"  # 0xb0-0xbf
    "\u2135\u2111\u211c\u2118\u2297\u2295\u2205\u2229\u222a\u2283\u2287\u2284\u2282\u2286\u2208\u2209"  # 0xc0-0xcf
    "\u2220\u2207\uf6da\uf6d9\uf6db\u220f\u221a\u22c5\u00ac\u2227\u2228\u21d4\u21d0\u21d1\u21d2\u21d3"  # 0xd0-0xdf
    "\u25ca\u2329\uf8e8\uf8e9\uf8ea\u2211\uf8eb\uf8ec\uf8ed\uf8ee\uf8ef\uf8f0\uf8f1\uf8f2\uf8f3\uf8f4"  # 0xe0-0xef
    "\u00f0\u232a\u222b\u2320\uf8f5\u2321\uf8f6\uf8f7\uf8f8\uf8f9\uf8fa\uf8fb\uf8fc\uf8fd\uf8fe\u00ff"  # 0xf0-0xff
)
assert len(_symbol_encoding) == 256

View File

@ -0,0 +1,261 @@
# manually generated from https://www.unicode.org/Public/MAPPINGS/VENDORS/ADOBE/zdingbat.txt
# The table is written as sixteen 16-character rows concatenated into a
# single 256-character string; list() splits it into the usual
# one-character-per-code table.
_zapfding_encoding = list(
    "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f"  # 0x00-0x0f
    "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"  # 0x10-0x1f
    " \u2701\u2702\u2703\u2704\u260e\u2706\u2707\u2708\u2709\u261b\u261e\u270c\u270d\u270e\u270f"  # 0x20-0x2f
    "\u2710\u2711\u2712\u2713\u2714\u2715\u2716\u2717\u2718\u2719\u271a\u271b\u271c\u271d\u271e\u271f"  # 0x30-0x3f
    "\u2720\u2721\u2722\u2723\u2724\u2725\u2726\u2727\u2605\u2729\u272a\u272b\u272c\u272d\u272e\u272f"  # 0x40-0x4f
    "\u2730\u2731\u2732\u2733\u2734\u2735\u2736\u2737\u2738\u2739\u273a\u273b\u273c\u273d\u273e\u273f"  # 0x50-0x5f
    "\u2740\u2741\u2742\u2743\u2744\u2745\u2746\u2747\u2748\u2749\u274a\u274b\u25cf\u274d\u25a0\u274f"  # 0x60-0x6f
    "\u2750\u2751\u2752\u25b2\u25bc\u25c6\u2756\u25d7\u2758\u2759\u275a\u275b\u275c\u275d\u275e\x7f"  # 0x70-0x7f
    "\uf8d7\uf8d8\uf8d9\uf8da\uf8db\uf8dc\uf8dd\uf8de\uf8df\uf8e0\uf8e1\uf8e2\uf8e3\uf8e4\x8e\x8f"  # 0x80-0x8f
    "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f"  # 0x90-0x9f
    "\xa0\u2761\u2762\u2763\u2764\u2765\u2766\u2767\u2663\u2666\u2665\u2660\u2460\u2461\u2462\u2463"  # 0xa0-0xaf
    "\u2464\u2465\u2466\u2467\u2468\u2469\u2776\u2777\u2778\u2779\u277a\u277b\u277c\u277d\u277e\u277f"  # 0xb0-0xbf
    "\u2780\u2781\u2782\u2783\u2784\u2785\u2786\u2787\u2788\u2789\u278a\u278b\u278c\u278d\u278e\u278f"  # 0xc0-0xcf
    "\u2790\u2791\u2792\u2793\u2794\u2192\u2194\u2195\u2798\u2799\u279a\u279b\u279c\u279d\u279e\u279f"  # 0xd0-0xdf
    "\u27a0\u27a1\u27a2\u27a3\u27a4\u27a5\u27a6\u27a7\u27a8\u27a9\u27aa\u27ab\u27ac\u27ad\u27ae\u27af"  # 0xe0-0xef
    "\xf0\u27b1\u27b2\u27b3\u27b4\u27b5\u27b6\u27b7\u27b8\u27b9\u27ba\u27bb\u27bc\u27bd\u27be\xff"  # 0xf0-0xff
)
assert len(_zapfding_encoding) == 256

View File

@ -0,0 +1,86 @@
# Copyright (c) 2023, exiledkingcc
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# * The name of the author may not be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
from pypdf._crypt_providers._base import CryptBase, CryptIdentity
# Pick the best available crypt backend, in order of preference:
# 1. cryptography (>3.0), 2. PyCryptodome, 3. pure-Python fallback.
try:
    from pypdf._crypt_providers._cryptography import (
        CryptAES,
        CryptRC4,
        aes_cbc_decrypt,
        aes_cbc_encrypt,
        aes_ecb_decrypt,
        aes_ecb_encrypt,
        crypt_provider,
        rc4_decrypt,
        rc4_encrypt,
    )
    from pypdf._utils import Version
    # crypt_provider[1] is the backend's version string.
    if Version(crypt_provider[1]) <= Version("3.0"):
        # This is due to the backend parameter being required back then:
        # https://cryptography.io/en/latest/changelog/#v3-1
        raise ImportError("cryptography<=3.0 is not supported")  # pragma: no cover
except ImportError:
    try:
        # Second choice: PyCryptodome.
        from pypdf._crypt_providers._pycryptodome import (  # type: ignore
            CryptAES,
            CryptRC4,
            aes_cbc_decrypt,
            aes_cbc_encrypt,
            aes_ecb_decrypt,
            aes_ecb_encrypt,
            crypt_provider,
            rc4_decrypt,
            rc4_encrypt,
        )
    except ImportError:
        # Last resort: pure-Python RC4; AES operations raise DependencyError.
        from pypdf._crypt_providers._fallback import (  # type: ignore
            CryptAES,
            CryptRC4,
            aes_cbc_decrypt,
            aes_cbc_encrypt,
            aes_ecb_decrypt,
            aes_ecb_encrypt,
            crypt_provider,
            rc4_decrypt,
            rc4_encrypt,
        )
__all__ = [
    "CryptAES",
    "CryptBase",
    "CryptIdentity",
    "CryptRC4",
    "aes_cbc_decrypt",
    "aes_cbc_encrypt",
    "aes_ecb_decrypt",
    "aes_ecb_encrypt",
    "crypt_provider",
    "rc4_decrypt",
    "rc4_encrypt",
]

View File

@ -0,0 +1,38 @@
# Copyright (c) 2023, exiledkingcc
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# * The name of the author may not be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
class CryptBase:
    """Base "cipher": passes data through unchanged; real ciphers override."""
    def encrypt(self, data: bytes) -> bytes:  # pragma: no cover
        """Return *data* unchanged."""
        return data
    def decrypt(self, data: bytes) -> bytes:  # pragma: no cover
        """Return *data* unchanged."""
        return data
class CryptIdentity(CryptBase):
    """Named no-op cipher; inherits the pass-through behavior of CryptBase."""
    pass

View File

@ -0,0 +1,118 @@
# Copyright (c) 2023, exiledkingcc
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# * The name of the author may not be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
import secrets
from cryptography import __version__
from cryptography.hazmat.primitives import padding
from cryptography.hazmat.primitives.ciphers.algorithms import AES
try:
# 43.0.0 - https://cryptography.io/en/latest/changelog/#v43-0-0
from cryptography.hazmat.decrepit.ciphers.algorithms import ARC4
except ImportError:
from cryptography.hazmat.primitives.ciphers.algorithms import ARC4
from cryptography.hazmat.primitives.ciphers.base import Cipher
from cryptography.hazmat.primitives.ciphers.modes import CBC, ECB
from pypdf._crypt_providers._base import CryptBase
# (backend name, backend version) advertised by this provider module.
crypt_provider = ("cryptography", __version__)
class CryptRC4(CryptBase):
    """RC4 stream cipher backed by the ``cryptography`` package."""
    def __init__(self, key: bytes) -> None:
        # ARC4 is a stream cipher; it takes no block mode.
        self.cipher = Cipher(ARC4(key), mode=None)
    def encrypt(self, data: bytes) -> bytes:
        enc = self.cipher.encryptor()
        return enc.update(data) + enc.finalize()
    def decrypt(self, data: bytes) -> bytes:
        dec = self.cipher.decryptor()
        return dec.update(data) + dec.finalize()
class CryptAES(CryptBase):
    """AES-CBC cipher (PKCS#7 padding) backed by the cryptography package."""
    def __init__(self, key: bytes) -> None:
        self.alg = AES(key)
    def encrypt(self, data: bytes) -> bytes:
        """Pad, encrypt under a fresh random IV, and return IV + ciphertext."""
        iv = secrets.token_bytes(16)
        pad = padding.PKCS7(128).padder()
        data = pad.update(data) + pad.finalize()
        cipher = Cipher(self.alg, CBC(iv))
        encryptor = cipher.encryptor()
        return iv + encryptor.update(data) + encryptor.finalize()
    def decrypt(self, data: bytes) -> bytes:
        """Decrypt IV-prefixed AES-CBC data and strip the PKCS#7 padding."""
        iv = data[:16]
        data = data[16:]
        # for empty encrypted data
        if not data:
            return data
        # just for robustness, it does not happen under normal circumstances
        if len(data) % 16 != 0:
            pad = padding.PKCS7(128).padder()
            data = pad.update(data) + pad.finalize()
        cipher = Cipher(self.alg, CBC(iv))
        decryptor = cipher.decryptor()
        d = decryptor.update(data) + decryptor.finalize()
        # Last byte is the PKCS#7 pad length; strip that many bytes.
        return d[: -d[-1]]
def rc4_encrypt(key: bytes, data: bytes) -> bytes:
    """One-shot RC4 encryption of *data* with *key*."""
    encryptor = Cipher(ARC4(key), mode=None).encryptor()
    return encryptor.update(data) + encryptor.finalize()
def rc4_decrypt(key: bytes, data: bytes) -> bytes:
    """One-shot RC4 decryption of *data* with *key*."""
    decryptor = Cipher(ARC4(key), mode=None).decryptor()
    return decryptor.update(data) + decryptor.finalize()
def aes_ecb_encrypt(key: bytes, data: bytes) -> bytes:
    """One-shot AES-ECB encryption (no padding added here)."""
    encryptor = Cipher(AES(key), mode=ECB()).encryptor()
    return encryptor.update(data) + encryptor.finalize()
def aes_ecb_decrypt(key: bytes, data: bytes) -> bytes:
    """One-shot AES-ECB decryption (no padding removed here)."""
    decryptor = Cipher(AES(key), mode=ECB()).decryptor()
    return decryptor.update(data) + decryptor.finalize()
def aes_cbc_encrypt(key: bytes, iv: bytes, data: bytes) -> bytes:
    """One-shot AES-CBC encryption with a caller-supplied IV (no padding)."""
    encryptor = Cipher(AES(key), mode=CBC(iv)).encryptor()
    return encryptor.update(data) + encryptor.finalize()
def aes_cbc_decrypt(key: bytes, iv: bytes, data: bytes) -> bytes:
    """One-shot AES-CBC decryption with a caller-supplied IV (no padding)."""
    decryptor = Cipher(AES(key), mode=CBC(iv)).decryptor()
    return decryptor.update(data) + decryptor.finalize()

View File

@ -0,0 +1,93 @@
# Copyright (c) 2023, exiledkingcc
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# * The name of the author may not be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
from pypdf._crypt_providers._base import CryptBase
from pypdf.errors import DependencyError
# Message raised whenever an AES operation is requested without a backend.
_DEPENDENCY_ERROR_STR = "cryptography>=3.1 is required for AES algorithm"
# (backend name, backend version): the pure-Python fallback has no version.
crypt_provider = ("local_crypt_fallback", "0.0.0")
class CryptRC4(CryptBase):
    """Pure-Python RC4: key schedule in ``__init__``, keystream in ``encrypt``."""
    def __init__(self, key: bytes) -> None:
        # Key-scheduling algorithm: permute a 256-byte state with the key.
        state = bytearray(range(256))
        swap = 0
        key_len = len(key)
        for idx in range(256):
            swap = (swap + state[idx] + key[idx % key_len]) % 256
            state[idx], state[swap] = state[swap], state[idx]
        self.s = state
    def encrypt(self, data: bytes) -> bytes:
        # Pseudo-random generation: XOR each input byte with the keystream.
        state = bytearray(self.s)
        result = bytearray(len(data))
        a = 0
        b = 0
        for pos, byte in enumerate(data):
            a = (a + 1) % 256
            b = (b + state[a]) % 256
            state[a], state[b] = state[b], state[a]
            result[pos] = byte ^ state[(state[a] + state[b]) % 256]
        return bytes(result)
    def decrypt(self, data: bytes) -> bytes:
        # RC4 is symmetric: decryption applies the same keystream XOR.
        return self.encrypt(data)
class CryptAES(CryptBase):
    """Placeholder AES: every operation raises because no backend is installed."""
    def __init__(self, key: bytes) -> None:
        pass
    def encrypt(self, data: bytes) -> bytes:
        raise DependencyError(_DEPENDENCY_ERROR_STR)
    def decrypt(self, data: bytes) -> bytes:
        raise DependencyError(_DEPENDENCY_ERROR_STR)
def rc4_encrypt(key: bytes, data: bytes) -> bytes:
    """One-shot RC4 encryption using the pure-Python fallback."""
    return CryptRC4(key).encrypt(data)
def rc4_decrypt(key: bytes, data: bytes) -> bytes:
    """One-shot RC4 decryption using the pure-Python fallback."""
    return CryptRC4(key).decrypt(data)
def aes_ecb_encrypt(key: bytes, data: bytes) -> bytes:
    """Unavailable without a crypto backend; always raises DependencyError."""
    raise DependencyError(_DEPENDENCY_ERROR_STR)
def aes_ecb_decrypt(key: bytes, data: bytes) -> bytes:
    """Unavailable without a crypto backend; always raises DependencyError."""
    raise DependencyError(_DEPENDENCY_ERROR_STR)
def aes_cbc_encrypt(key: bytes, iv: bytes, data: bytes) -> bytes:
    """Unavailable without a crypto backend; always raises DependencyError."""
    raise DependencyError(_DEPENDENCY_ERROR_STR)
def aes_cbc_decrypt(key: bytes, iv: bytes, data: bytes) -> bytes:
    """Unavailable without a crypto backend; always raises DependencyError."""
    raise DependencyError(_DEPENDENCY_ERROR_STR)

View File

@ -0,0 +1,97 @@
# Copyright (c) 2023, exiledkingcc
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# * The name of the author may not be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
import secrets
from Crypto import __version__
from Crypto.Cipher import AES, ARC4
from Crypto.Util.Padding import pad
from pypdf._crypt_providers._base import CryptBase
# (backend name, backend version) advertised by this provider module.
crypt_provider = ("pycryptodome", __version__)
class CryptRC4(CryptBase):
    """RC4 cipher backed by PyCryptodome; a fresh ARC4 cipher per call."""
    def __init__(self, key: bytes) -> None:
        self.key = key
    def encrypt(self, data: bytes) -> bytes:
        return ARC4.ARC4Cipher(self.key).encrypt(data)
    def decrypt(self, data: bytes) -> bytes:
        return ARC4.ARC4Cipher(self.key).decrypt(data)
class CryptAES(CryptBase):
    """AES-CBC cipher backed by PyCryptodome; the IV prefixes the ciphertext."""
    def __init__(self, key: bytes) -> None:
        self.key = key
    def encrypt(self, data: bytes) -> bytes:
        """Pad with PKCS#7, encrypt under a random IV, return IV + ciphertext."""
        iv = secrets.token_bytes(16)
        data = pad(data, 16)
        aes = AES.new(self.key, AES.MODE_CBC, iv)
        return iv + aes.encrypt(data)
    def decrypt(self, data: bytes) -> bytes:
        """Decrypt IV-prefixed AES-CBC data and strip the PKCS#7 padding."""
        iv = data[:16]
        data = data[16:]
        # for empty encrypted data
        if not data:
            return data
        # just for robustness, it does not happen under normal circumstances
        if len(data) % 16 != 0:
            data = pad(data, 16)
        aes = AES.new(self.key, AES.MODE_CBC, iv)
        d = aes.decrypt(data)
        # Last byte is the pad length; strip that many trailing bytes.
        return d[: -d[-1]]
def rc4_encrypt(key: bytes, data: bytes) -> bytes:
    """One-shot RC4 encryption via PyCryptodome."""
    return ARC4.ARC4Cipher(key).encrypt(data)
def rc4_decrypt(key: bytes, data: bytes) -> bytes:
    """One-shot RC4 decryption via PyCryptodome."""
    return ARC4.ARC4Cipher(key).decrypt(data)
def aes_ecb_encrypt(key: bytes, data: bytes) -> bytes:
    """One-shot AES-ECB encryption (no padding added here)."""
    return AES.new(key, AES.MODE_ECB).encrypt(data)
def aes_ecb_decrypt(key: bytes, data: bytes) -> bytes:
    """One-shot AES-ECB decryption (no padding removed here)."""
    return AES.new(key, AES.MODE_ECB).decrypt(data)
def aes_cbc_encrypt(key: bytes, iv: bytes, data: bytes) -> bytes:
    """One-shot AES-CBC encryption with a caller-supplied IV (no padding)."""
    return AES.new(key, AES.MODE_CBC, iv).encrypt(data)
def aes_cbc_decrypt(key: bytes, iv: bytes, data: bytes) -> bytes:
    """One-shot AES-CBC decryption with a caller-supplied IV (no padding)."""
    return AES.new(key, AES.MODE_CBC, iv).decrypt(data)

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,42 @@
# Copyright (c) 2006, Mathieu Fenniak
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# * The name of the author may not be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
from ._utils import (
deprecation_with_replacement,
)
class PdfMerger:
    """
    Use :class:`PdfWriter` instead.
    .. deprecated:: 5.0.0
    """
    def __init__(self) -> None:
        # Stub: delegates to the central deprecation helper
        # (whether it warns or raises is defined in ._utils).
        deprecation_with_replacement("PdfMerger", "PdfWriter", "5.0.0")

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,289 @@
"""
Page labels are shown by PDF viewers as "the page number".
A page has a numeric index, starting at 0. Additionally, the page
has a label. In the most simple case:
label = index + 1
However, the title page and the table of contents might have Roman numerals as
page labels. This makes things more complicated.
Example 1
---------
>>> reader.root_object["/PageLabels"]["/Nums"]
[0, IndirectObject(18, 0, 139929798197504),
8, IndirectObject(19, 0, 139929798197504)]
>>> reader.get_object(reader.root_object["/PageLabels"]["/Nums"][1])
{'/S': '/r'}
>>> reader.get_object(reader.root_object["/PageLabels"]["/Nums"][3])
{'/S': '/D'}
Example 2
---------
The following is a document with pages labeled
i, ii, iii, iv, 1, 2, 3, A-8, A-9, ...
1 0 obj
<< /Type /Catalog
/PageLabels << /Nums [
0 << /S /r >>
4 << /S /D >>
7 << /S /D
/P ( A- )
/St 8
>>
% A number tree containing
% three page label dictionaries
]
>>
...
>>
endobj
§12.4.2 PDF Specification 1.7 and 2.0
=====================================
Entries in a page label dictionary
----------------------------------
The /S key:
D Decimal Arabic numerals
R Uppercase Roman numerals
r Lowercase Roman numerals
A Uppercase letters (A to Z for the first 26 pages,
AA to ZZ for the next 26, and so on)
a Lowercase letters (a to z for the first 26 pages,
aa to zz for the next 26, and so on)
"""
from typing import Iterator, List, Optional, Tuple, cast
from ._protocols import PdfCommonDocProtocol
from ._utils import logger_warning
from .generic import (
ArrayObject,
DictionaryObject,
NullObject,
NumberObject,
is_null_or_none,
)
def number2uppercase_roman_numeral(num: int) -> str:
    """Convert a positive integer to an uppercase Roman numeral string."""
    # Subtractive-notation value/glyph pairs, largest first.
    table = (
        (1000, "M"),
        (900, "CM"),
        (500, "D"),
        (400, "CD"),
        (100, "C"),
        (90, "XC"),
        (50, "L"),
        (40, "XL"),
        (10, "X"),
        (9, "IX"),
        (5, "V"),
        (4, "IV"),
        (1, "I"),
    )
    pieces = []
    remaining = num
    for value, glyph in table:
        count = remaining // value
        pieces.append(glyph * count)
        remaining -= value * count
        if remaining <= 0:
            break
    return "".join(pieces)
def number2lowercase_roman_numeral(number: int) -> str:
    """Convert a positive integer to a lowercase Roman numeral string."""
    return number2uppercase_roman_numeral(number).lower()
def number2uppercase_letter(number: int) -> str:
    """Convert a positive integer to letters: 1-26 -> A-Z, 27 -> AA, ...

    This is bijective base-26 ("spreadsheet column") numbering.

    Raises:
        ValueError: If *number* is not positive.
    """
    if number <= 0:
        raise ValueError("Expecting a positive number")
    letters = []
    remaining = number
    while remaining > 0:
        # Shift by one so that 26 maps to "Z" rather than rolling over.
        remaining, offset = divmod(remaining - 1, 26)
        letters.append(chr(ord("A") + offset))
    return "".join(reversed(letters))
def number2lowercase_letter(number: int) -> str:
    """Convert a positive integer to lowercase letters: 1-26 -> a-z, 27 -> aa."""
    return number2uppercase_letter(number).lower()
def get_label_from_nums(dictionary_object: DictionaryObject, index: int) -> str:
    """
    Resolve the label of page ``index`` from a number-tree node's /Nums array.

    Args:
        dictionary_object: A number-tree node holding a "/Nums" array.
        index: Zero-based page index to look up.

    Returns:
        The formatted page label; falls back to the 1-based page number when
        the matching entry is missing or malformed.
    """
    # [Nums] shall be an array of the form
    # [ key_1 value_1 key_2 value_2 ... key_n value_n ]
    # where each key_i is an integer and the corresponding
    # value_i shall be the object associated with that key.
    # The keys shall be sorted in numerical order,
    # analogously to the arrangement of keys in a name tree
    # as described in 7.9.6, "Name Trees."
    nums = cast(ArrayObject, dictionary_object["/Nums"])
    i = 0
    value = None
    start_index = 0
    # Walk the (key, value) pairs and keep the last pair whose key is <= index.
    while i < len(nums):
        start_index = nums[i]
        value = nums[i + 1].get_object()
        if i + 2 == len(nums):
            break
        if nums[i + 2] > index:
            break
        i += 2
    # Formatter per /S numbering style; absent /S means "no numeric portion".
    m = {
        None: lambda n: "",
        "/D": lambda n: str(n),
        "/R": number2uppercase_roman_numeral,
        "/r": number2lowercase_roman_numeral,
        "/A": number2uppercase_letter,
        "/a": number2lowercase_letter,
    }
    # if /Nums array is not following the specification or if /Nums is empty
    if not isinstance(value, dict):
        return str(index + 1)  # Fallback
    start = value.get("/St", 1)  # numbering of this range starts at /St (default 1)
    prefix = value.get("/P", "")
    return prefix + m[value.get("/S")](index - start_index + start)
def index2label(reader: PdfCommonDocProtocol, index: int) -> str:
    """
    Resolve the displayed page label for a page index.

    See 7.9.7 "Number Trees".

    Args:
        reader: The PdfReader
        index: The index of the page

    Returns:
        The label of the page, e.g. "iv" or "4".
    """
    root = cast(DictionaryObject, reader.root_object)
    if "/PageLabels" not in root:
        return str(index + 1)  # Fallback
    number_tree = cast(DictionaryObject, root["/PageLabels"].get_object())
    if "/Nums" in number_tree:
        # Leaf node: resolve directly from its /Nums array.
        return get_label_from_nums(number_tree, index)
    if "/Kids" in number_tree and not isinstance(number_tree["/Kids"], NullObject):
        # number_tree = {'/Kids': [IndirectObject(7333, 0, 140132998195856), ...]}
        # Limit maximum depth.
        level = 0
        while level < 100:
            kids = cast(List[DictionaryObject], number_tree["/Kids"])
            for kid in kids:
                # kid = {'/Limits': [0, 63], '/Nums': [0, {'/P': 'C1'}, ...]}
                limits = cast(List[int], kid["/Limits"])
                if limits[0] <= index <= limits[1]:
                    if not is_null_or_none(kid.get("/Kids", None)):
                        # Recursive definition.
                        level += 1
                        if level == 100:  # pragma: no cover
                            raise NotImplementedError(
                                "Too deep nesting is not supported."
                            )
                        number_tree = kid
                        # Exit the inner `for` loop and continue at the next level with the
                        # next iteration of the `while` loop.
                        break
                    return get_label_from_nums(kid, index)
            else:
                # When there are no kids, make sure to exit the `while` loop directly
                # and continue with the fallback.
                break
    logger_warning(f"Could not reliably determine page label for {index}.", __name__)
    return str(index + 1)  # Fallback if neither /Nums nor /Kids is in the number_tree
def nums_insert(
    key: NumberObject,
    value: DictionaryObject,
    nums: ArrayObject,
) -> None:
    """
    Insert or replace a key/value pair in a Nums array, keeping keys sorted.

    See 7.9.7 "Number Trees".

    Args:
        key: number key of the entry
        value: value of the entry
        nums: Nums array to modify in place

    Raises:
        ValueError: if ``nums`` does not hold an even number of elements
    """
    if len(nums) % 2 != 0:
        raise ValueError("A nums like array must have an even number of elements")
    # Scan backwards two items at a time to find the insertion point.
    pos = len(nums)
    while pos != 0 and key <= nums[pos - 2]:
        pos -= 2
    if pos < len(nums) and key == nums[pos]:
        # Key already present: overwrite its value.
        nums[pos + 1] = value
    else:
        nums.insert(pos, key)
        nums.insert(pos + 1, value)
def nums_clear_range(
    key: NumberObject,
    page_index_to: int,
    nums: ArrayObject,
) -> None:
    """
    Drop every key/value pair that follows ``key`` up to ``page_index_to``.

    See 7.9.7 "Number Trees".

    Args:
        key: number key of the entry immediately before the cleared range
        page_index_to: inclusive upper bound (page index) of the range to clear
        nums: Nums array to modify in place

    Raises:
        ValueError: if ``nums`` is malformed or the bounds are inverted
    """
    if len(nums) % 2 != 0:
        raise ValueError("A nums like array must have an even number of elements")
    if page_index_to < key:
        raise ValueError("page_index_to must be greater or equal than key")
    # Start just past the (key, value) pair and delete following pairs while
    # their key still falls inside the requested range.
    cursor = nums.index(key) + 2
    while cursor < len(nums) and nums[cursor] <= page_index_to:
        del nums[cursor:cursor + 2]
def nums_next(
    key: NumberObject,
    nums: ArrayObject,
) -> Tuple[Optional[NumberObject], Optional[DictionaryObject]]:
    """
    Return the (key, value) pair of the entry after the given one.

    See 7.9.7 "Number Trees".

    Args:
        key: number key of the entry
        nums: Nums array

    Returns:
        The following (key, value) pair, or (None, None) at the end.
    """
    if len(nums) % 2 != 0:
        raise ValueError("A nums like array must have an even number of elements")
    # The pair after `key` starts two slots past its position.
    nxt = nums.index(key) + 2
    if nxt >= len(nums):
        return (None, None)
    return (nums[nxt], nums[nxt + 1])

View File

@ -0,0 +1,86 @@
"""Helpers for working with PDF types."""
from abc import abstractmethod
from pathlib import Path
from typing import IO, Any, Dict, List, Optional, Protocol, Tuple, Union
from ._utils import StrByteType, StreamType
class PdfObjectProtocol(Protocol):
    """Structural (duck-typed) interface shared by all PDF objects."""

    # Back-reference to the indirect object wrapping this object, if any.
    indirect_reference: Any

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Union[Tuple[str, ...], List[str], None] = (),
    ) -> Any:
        """Duplicate this object into the destination document."""
        ...  # pragma: no cover

    def _reference_clone(self, clone: Any, pdf_dest: Any) -> Any:
        ...  # pragma: no cover

    def get_object(self) -> Optional["PdfObjectProtocol"]:
        """Resolve to the underlying object (dereference if indirect)."""
        ...  # pragma: no cover

    def hash_value(self) -> bytes:
        ...  # pragma: no cover

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Serialize this object to ``stream``."""
        ...  # pragma: no cover
class XmpInformationProtocol(PdfObjectProtocol):
    """Marker protocol for XMP metadata objects; adds nothing beyond PdfObjectProtocol."""

    pass
class PdfCommonDocProtocol(Protocol):
    """Interface common to both reader and writer documents."""

    @property
    def pdf_header(self) -> str:
        ...  # pragma: no cover

    @property
    def pages(self) -> List[Any]:
        ...  # pragma: no cover

    @property
    def root_object(self) -> PdfObjectProtocol:
        """The document catalog (root dictionary)."""
        ...  # pragma: no cover

    def get_object(self, indirect_reference: Any) -> Optional[PdfObjectProtocol]:
        ...  # pragma: no cover

    @property
    def strict(self) -> bool:
        ...  # pragma: no cover
class PdfReaderProtocol(PdfCommonDocProtocol, Protocol):
    """Reader-side document interface: adds xref table and trailer access."""

    @property
    @abstractmethod
    def xref(self) -> Dict[int, Dict[int, Any]]:
        ...  # pragma: no cover

    @property
    @abstractmethod
    def trailer(self) -> Dict[str, Any]:
        ...  # pragma: no cover
class PdfWriterProtocol(PdfCommonDocProtocol, Protocol):
    """Writer-side document interface: object store, id mapping, and output."""

    # All objects added to the output document, in insertion order.
    _objects: List[Any]
    # Maps source-document ids to ids in this writer (per source).
    _id_translated: Dict[int, Dict[int, int]]
    incremental: bool
    _reader: Any  # PdfReader

    @abstractmethod
    def write(self, stream: Union[Path, StrByteType]) -> Tuple[bool, IO[Any]]:
        ...  # pragma: no cover

    @abstractmethod
    def _add_object(self, obj: Any) -> Any:
        ...  # pragma: no cover

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,248 @@
"""
Code related to text extraction.
Some parts are still in _page.py. In doubt, they will stay there.
"""
import math
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
from ..generic import DictionaryObject, TextStringObject, encode_pdfdocencoding
# Inclusive custom right-to-left code point range; -1/-1 disables the range.
CUSTOM_RTL_MIN: int = -1
CUSTOM_RTL_MAX: int = -1
# Individual code points that always keep the current insertion order.
CUSTOM_RTL_SPECIAL_CHARS: List[int] = []
# Gap (in space widths) beyond which layout mode starts a new BT group.
LAYOUT_NEW_BT_GROUP_SPACE_WIDTHS: int = 5
class OrientationNotFoundError(Exception):
    """Raised when a text fragment's orientation is not among the requested ones."""

    pass
def set_custom_rtl(
    _min: Union[str, int, None] = None,
    _max: Union[str, int, None] = None,
    specials: Union[str, List[int], None] = None,
) -> Tuple[int, int, List[int]]:
    """
    Change the Right-To-Left and special characters custom parameters.

    Args:
        _min: New lower bound of the custom right-to-left character range.
            ``None`` leaves the current value unchanged; a one-character
            string is converted to its code point. Default -1 (no range).
        _max: New upper bound of the custom right-to-left character range.
            Same conventions as ``_min``.
        specials: New list of special characters kept in the current
            insertion order. ``None`` leaves the current value unchanged;
            a string is converted to a list of code points. Default [].

    Returns:
        The resulting (CUSTOM_RTL_MIN, CUSTOM_RTL_MAX,
        CUSTOM_RTL_SPECIAL_CHARS) values.
    """
    global CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS
    # Strings are normalised to their code point; None leaves values untouched.
    if isinstance(_min, str):
        CUSTOM_RTL_MIN = ord(_min)
    elif isinstance(_min, int):
        CUSTOM_RTL_MIN = _min
    if isinstance(_max, str):
        CUSTOM_RTL_MAX = ord(_max)
    elif isinstance(_max, int):
        CUSTOM_RTL_MAX = _max
    if isinstance(specials, list):
        CUSTOM_RTL_SPECIAL_CHARS = specials
    elif isinstance(specials, str):
        CUSTOM_RTL_SPECIAL_CHARS = [ord(ch) for ch in specials]
    return CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS
def mult(m: List[float], n: List[float]) -> List[float]:
    """Multiply two transformation matrices given in PDF's 6-element [a b c d e f] form."""
    a, b, c, d, e, f = m
    return [
        a * n[0] + b * n[2],
        a * n[1] + b * n[3],
        c * n[0] + d * n[2],
        c * n[1] + d * n[3],
        e * n[0] + f * n[2] + n[4],
        e * n[1] + f * n[3] + n[5],
    ]
def orient(m: List[float]) -> int:
    """
    Return the rotation (0, 90, 180, or 270 degrees) implied by matrix ``m``.

    The sign of m[3] decides between upright (0) and upside-down (180);
    otherwise the sign of m[1] decides between 90 and 270.
    """
    d = m[3]
    if d > 1e-6:
        return 0
    if d < -1e-6:
        return 180
    return 90 if m[1] > 0 else 270
def crlf_space_check(
    text: str,
    cmtm_prev: Tuple[List[float], List[float]],
    cmtm_matrix: Tuple[List[float], List[float]],
    memo_cmtm: Tuple[List[float], List[float]],
    cmap: Tuple[
        Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
    ],
    orientations: Tuple[int, ...],
    output: str,
    font_size: float,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
    str_widths: float,
    spacewidth: float,
    str_height: float,
) -> Tuple[str, str, List[float], List[float]]:
    """
    Decide whether a newline or a space must be inserted between two text
    fragments, based on how far the rendering position moved.

    Args:
        text: text accumulated for the current fragment so far.
        cmtm_prev: (cm, tm) matrices of the previous fragment.
        cmtm_matrix: (cm, tm) matrices of the current fragment.
        memo_cmtm: (cm, tm) matrices forwarded to ``visitor_text``.
        cmap: (encoding, char map, font resource name, font dictionary) tuple.
        orientations: orientations (degrees) whose text should be kept.
        output: text already flushed for the page.
        font_size: current font size.
        visitor_text: optional callback invoked when a line break is emitted.
        str_widths: width of the previously rendered string.
        spacewidth: width of a space in the current font.
        str_height: height of the previously rendered string.

    Returns:
        (text, output, cm_prev, tm_prev) updated for the next fragment.

    Raises:
        OrientationNotFoundError: when the fragment's orientation is not
            one of ``orientations``.
    """
    cm_prev = cmtm_prev[0]
    tm_prev = cmtm_prev[1]
    cm_matrix = cmtm_matrix[0]
    tm_matrix = cmtm_matrix[1]
    memo_cm = memo_cmtm[0]
    memo_tm = memo_cmtm[1]
    # Compare rendered positions (tm composed with cm) of both fragments.
    m_prev = mult(tm_prev, cm_prev)
    m = mult(tm_matrix, cm_matrix)
    orientation = orient(m)
    delta_x = m[4] - m_prev[4]
    delta_y = m[5] - m_prev[5]
    # Table 108 of the 1.7 reference ("Text positioning operators")
    scale_prev_x = math.sqrt(tm_prev[0]**2 + tm_prev[1]**2)
    scale_prev_y = math.sqrt(tm_prev[2]**2 + tm_prev[3]**2)
    scale_y = math.sqrt(tm_matrix[2]**2 + tm_matrix[3]**2)
    # NOTE(review): this assignment appears to be dead — cm_prev is
    # reassigned below before being read again; confirm before removing.
    cm_prev = m
    if orientation not in orientations:
        raise OrientationNotFoundError
    # Map the position delta onto the text's own axes for this orientation.
    if orientation in (0, 180):
        moved_height: float = delta_y
        moved_width: float = delta_x
    elif orientation in (90, 270):
        moved_height = delta_x
        moved_width = delta_y
    try:
        # A large vertical move means a new line; a large horizontal move
        # means a missing inter-word space.
        if abs(moved_height) > 0.8 * min(str_height * scale_prev_y, font_size * scale_y):
            if (output + text)[-1] != "\n":
                output += text + "\n"
                if visitor_text is not None:
                    visitor_text(
                        text + "\n",
                        memo_cm,
                        memo_tm,
                        cmap[3],
                        font_size,
                    )
            text = ""
        elif (
            (moved_width >= (spacewidth + str_widths) * scale_prev_x)
            and (output + text)[-1] != " "
        ):
            text += " "
    except Exception:
        # (output + text) may be empty, making the [-1] lookups fail.
        pass
    tm_prev = tm_matrix.copy()
    cm_prev = cm_matrix.copy()
    return text, output, cm_prev, tm_prev
def get_text_operands(
    operands: List[Union[str, TextStringObject]],
    cm_matrix: List[float],
    tm_matrix: List[float],
    cmap: Tuple[
        Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
    ],
    orientations: Tuple[int, ...]
) -> Tuple[str, bool]:
    """
    Decode the operand of a text-show operator into a string.

    Only ``operands[0]`` is consumed. Text whose orientation is not in
    ``orientations`` is skipped (empty string returned).

    Args:
        operands: the text-show operator's operand list.
        cm_matrix: current transformation matrix.
        tm_matrix: current text matrix.
        cmap: (encoding, char map, font resource name, font dictionary) tuple.
        orientations: orientations (degrees) whose text should be kept.

    Returns:
        (decoded text, True when ``operands[0]`` was already a str)
    """
    t: str = ""
    is_str_operands = False
    m = mult(tm_matrix, cm_matrix)
    orientation = orient(m)
    if orientation in orientations and len(operands) > 0:
        if isinstance(operands[0], str):
            t = operands[0]
            is_str_operands = True
        else:
            t = ""
            tt: bytes = (
                encode_pdfdocencoding(operands[0])
                if isinstance(operands[0], str)
                else operands[0]
            )
            if isinstance(cmap[0], str):
                try:
                    t = tt.decode(cmap[0], "surrogatepass")  # apply str encoding
                except Exception:
                    # the data does not match the expectation,
                    # we use the alternative ;
                    # text extraction may not be good
                    t = tt.decode(
                        "utf-16-be" if cmap[0] == "charmap" else "charmap",
                        "surrogatepass",
                    )  # apply str encoding
            else:  # apply dict encoding
                t = "".join(
                    [cmap[0][x] if x in cmap[0] else bytes((x,)).decode() for x in tt]
                )
    return (t, is_str_operands)
def get_display_str(
    text: str,
    cm_matrix: List[float],
    tm_matrix: List[float],
    cmap: Tuple[
        Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
    ],
    text_operands: str,
    font_size: float,
    rtl_dir: bool,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]]
) -> Tuple[str, bool]:
    """
    Append decoded characters to ``text``, tracking right-to-left runs.

    Whenever the writing direction flips, the accumulated text is flushed
    through ``visitor_text`` (if provided) and a fresh run is started.

    Args:
        text: text accumulated so far.
        cm_matrix: current transformation matrix (forwarded to the visitor).
        tm_matrix: current text matrix (forwarded to the visitor).
        cmap: (encoding, char map, font resource name, font dictionary) tuple.
        text_operands: decoded characters to append.
        font_size: current font size (forwarded to the visitor).
        rtl_dir: True while inside a right-to-left run.
        visitor_text: optional callback invoked on direction changes.

    Returns:
        (updated text, updated rtl_dir)
    """
    # "\u0590 - \u08FF \uFB50 - \uFDFF"
    for x in [cmap[1].get(x, x) for x in text_operands]:
        # x can be a sequence of bytes ; ex: habibi.pdf
        if len(x) == 1:
            xx = ord(x)
        else:
            xx = 1
        # fmt: off
        if (
            # cases where the current inserting order is kept
            (xx <= 0x2F)  # punctuations but...
            or 0x3A <= xx <= 0x40  # numbers (x30-39)
            or 0x2000 <= xx <= 0x206F  # upper punctuations..
            or 0x20A0 <= xx <= 0x21FF  # but (numbers) indices/exponents
            or xx in CUSTOM_RTL_SPECIAL_CHARS  # customized....
        ):
            text = x + text if rtl_dir else text + x
        elif (  # right-to-left characters set
            0x0590 <= xx <= 0x08FF
            or 0xFB1D <= xx <= 0xFDFF
            or 0xFE70 <= xx <= 0xFEFF
            or CUSTOM_RTL_MIN <= xx <= CUSTOM_RTL_MAX
        ):
            if not rtl_dir:
                rtl_dir = True
                if visitor_text is not None:
                    visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
                text = ""
            text = x + text
        else:  # left-to-right
            if rtl_dir:
                rtl_dir = False
                if visitor_text is not None:
                    visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
                text = ""
            text = text + x
        # fmt: on
    return text, rtl_dir

View File

@ -0,0 +1,16 @@
"""Layout mode text extraction extension for pypdf"""
from ._fixed_width_page import (
fixed_char_width,
fixed_width_page,
text_show_operations,
y_coordinate_groups,
)
from ._font import Font
# Public names re-exported by the layout-mode extraction subpackage.
__all__ = [
    "Font",
    "fixed_char_width",
    "fixed_width_page",
    "text_show_operations",
    "y_coordinate_groups",
]

View File

@ -0,0 +1,394 @@
"""Extract PDF text preserving the layout of the source PDF"""
from itertools import groupby
from math import ceil
from pathlib import Path
from typing import Any, Dict, Iterator, List, Literal, Optional, Tuple, TypedDict
from ..._utils import logger_warning
from .. import LAYOUT_NEW_BT_GROUP_SPACE_WIDTHS
from ._font import Font
from ._text_state_manager import TextStateManager
from ._text_state_params import TextStateParams
class BTGroup(TypedDict):
    """
    Dict describing a line of text rendered within a BT/ET operator pair.
    If multiple text show operations render text on the same line, the text
    will be combined into a single BTGroup dict.

    Keys:
        tx: x coordinate of first character in BTGroup
        ty: y coordinate of first character in BTGroup
        font_size: nominal font size
        font_height: effective font height
        text: rendered text
        displaced_tx: x coordinate of last character in BTGroup
        flip_sort: -1 if page is upside down, else 1
    """

    # NOTE: TypedDict — instances are plain dicts at runtime.
    tx: float
    ty: float
    font_size: float
    font_height: float
    text: str
    displaced_tx: float
    flip_sort: Literal[-1, 1]
def bt_group(tj_op: TextStateParams, rendered_text: str, dispaced_tx: float) -> BTGroup:
    """
    Build a BTGroup dict from a text state snapshot and its rendered text.

    Args:
        tj_op (TextStateParams): text state at the group's first character
        rendered_text (str): rendered text
        dispaced_tx (float): x coordinate of last character in BTGroup
            (parameter name kept as-is for backward compatibility)
    """
    sort_direction: Literal[-1, 1] = -1 if tj_op.flip_vertical else 1
    return BTGroup(
        tx=tj_op.tx,
        ty=tj_op.ty,
        font_size=tj_op.font_size,
        font_height=tj_op.font_height,
        text=rendered_text,
        displaced_tx=dispaced_tx,
        flip_sort=sort_direction,
    )
def recurs_to_target_op(
    ops: Iterator[Tuple[List[Any], bytes]],
    text_state_mgr: TextStateManager,
    end_target: Literal[b"Q", b"ET"],
    fonts: Dict[str, Font],
    strip_rotated: bool = True,
) -> Tuple[List[BTGroup], List[TextStateParams]]:
    """
    Recurse operators between BT/ET and/or q/Q operators managing the transform
    stack and capturing text positioning and rendering data.

    Args:
        ops: iterator of operators in content stream
        text_state_mgr: a TextStateManager instance
        end_target: Either b"Q" (ends b"q" op) or b"ET" (ends b"BT" op)
        fonts: font dictionary as returned by PageObject._layout_mode_fonts()
        strip_rotated: skip text that is rotated w.r.t. the page (default True)

    Returns:
        tuple: list of BTGroup dicts + list of TextStateParams dataclass instances.
    """
    # 1 entry per line of text rendered within each BT/ET operation.
    bt_groups: List[BTGroup] = []
    # 1 entry per text show operator (Tj/TJ/'/")
    tj_ops: List[TextStateParams] = []
    if end_target == b"Q":
        # add new q level. cm's added at this level will be popped at next b'Q'
        text_state_mgr.add_q()
    while True:
        try:
            operands, op = next(ops)
        except StopIteration:
            # Content stream ended before the closing operator.
            return bt_groups, tj_ops
        if op == end_target:
            if op == b"Q":
                text_state_mgr.remove_q()
            if op == b"ET":
                if not tj_ops:
                    return bt_groups, tj_ops
                _text = ""
                bt_idx = 0  # idx of first tj in this bt group
                last_displaced_tx = tj_ops[bt_idx].displaced_tx
                last_ty = tj_ops[bt_idx].ty
                for _idx, _tj in enumerate(
                    tj_ops
                ):  # ... build text from new Tj operators
                    if strip_rotated and _tj.rotated:
                        continue
                    if not _tj.font.interpretable:  # generates warning
                        continue
                    # if the y position of the text is greater than the font height, assume
                    # the text is on a new line and start a new group
                    if abs(_tj.ty - last_ty) > _tj.font_height:
                        if _text.strip():
                            bt_groups.append(
                                bt_group(tj_ops[bt_idx], _text, last_displaced_tx)
                            )
                        bt_idx = _idx
                        _text = ""
                    # if the x position of the text is less than the last x position by
                    # more than 5 spaces widths, assume the text order should be flipped
                    # and start a new group
                    if (
                        last_displaced_tx - _tj.tx
                        > _tj.space_tx * LAYOUT_NEW_BT_GROUP_SPACE_WIDTHS
                    ):
                        if _text.strip():
                            bt_groups.append(
                                bt_group(tj_ops[bt_idx], _text, last_displaced_tx)
                            )
                        bt_idx = _idx
                        last_displaced_tx = _tj.displaced_tx
                        _text = ""
                    # calculate excess x translation based on ending tx of previous Tj.
                    # multiply by bool (_idx != bt_idx) to ensure spaces aren't double
                    # applied to the first tj of a BTGroup in fixed_width_page().
                    excess_tx = round(_tj.tx - last_displaced_tx, 3) * (_idx != bt_idx)
                    # space_tx could be 0 if either Tz or font_size was 0 for this _tj.
                    spaces = int(excess_tx // _tj.space_tx) if _tj.space_tx else 0
                    new_text = f'{" " * spaces}{_tj.txt}'
                    last_ty = _tj.ty
                    _text = f"{_text}{new_text}"
                    last_displaced_tx = _tj.displaced_tx
                if _text:
                    bt_groups.append(bt_group(tj_ops[bt_idx], _text, last_displaced_tx))
                text_state_mgr.reset_tm()
            return bt_groups, tj_ops
        if op == b"q":
            bts, tjs = recurs_to_target_op(
                ops, text_state_mgr, b"Q", fonts, strip_rotated
            )
            bt_groups.extend(bts)
            tj_ops.extend(tjs)
        elif op == b"cm":
            text_state_mgr.add_cm(*operands)
        elif op == b"BT":
            bts, tjs = recurs_to_target_op(
                ops, text_state_mgr, b"ET", fonts, strip_rotated
            )
            bt_groups.extend(bts)
            tj_ops.extend(tjs)
        elif op == b"Tj":
            tj_ops.append(text_state_mgr.text_state_params(operands[0]))
        elif op == b"TJ":
            _tj = text_state_mgr.text_state_params()
            for tj_op in operands[0]:
                if isinstance(tj_op, bytes):
                    _tj = text_state_mgr.text_state_params(tj_op)
                    tj_ops.append(_tj)
                else:
                    # Numeric TJ entries shift the text position.
                    text_state_mgr.add_trm(_tj.displacement_matrix(TD_offset=tj_op))
        elif op == b"'":
            text_state_mgr.reset_trm()
            text_state_mgr.add_tm([0, -text_state_mgr.TL])
            tj_ops.append(text_state_mgr.text_state_params(operands[0]))
        elif op == b'"':
            text_state_mgr.reset_trm()
            text_state_mgr.set_state_param(b"Tw", operands[0])
            text_state_mgr.set_state_param(b"Tc", operands[1])
            text_state_mgr.add_tm([0, -text_state_mgr.TL])
            tj_ops.append(text_state_mgr.text_state_params(operands[2]))
        elif op in (b"Td", b"Tm", b"TD", b"T*"):
            text_state_mgr.reset_trm()
            if op == b"Tm":
                text_state_mgr.reset_tm()
            elif op == b"TD":
                text_state_mgr.set_state_param(b"TL", -operands[1])
            elif op == b"T*":
                operands = [0, -text_state_mgr.TL]
            text_state_mgr.add_tm(operands)
        elif op == b"Tf":
            text_state_mgr.set_font(fonts[operands[0]], operands[1])
        else:  # handle Tc, Tw, Tz, TL, and Ts operators
            text_state_mgr.set_state_param(op, operands)
def y_coordinate_groups(
    bt_groups: List[BTGroup], debug_path: Optional[Path] = None
) -> Dict[int, List[BTGroup]]:
    """
    Group text operations by rendered y coordinate, i.e. the line number.

    Args:
        bt_groups: list of dicts as returned by text_show_operations()
        debug_path (Path, optional): Path to a directory for saving debug output.

    Returns:
        Dict[int, List[BTGroup]]: dict of lists of text rendered by each BT operator
        keyed by y coordinate
    """
    # NOTE: groupby only merges adjacent entries, so bt_groups must already be
    # sorted by (ty * flip_sort) — as done by text_show_operations().
    ty_groups = {
        ty: sorted(grp, key=lambda x: x["tx"])
        for ty, grp in groupby(
            bt_groups, key=lambda bt_grp: int(bt_grp["ty"] * bt_grp["flip_sort"])
        )
    }
    # combine groups whose y coordinates differ by less than the effective font height
    # (accounts for mixed fonts and other minor oddities)
    last_ty = next(iter(ty_groups))
    last_txs = {int(_t["tx"]) for _t in ty_groups[last_ty] if _t["text"].strip()}
    for ty in list(ty_groups)[1:]:
        fsz = min(ty_groups[_y][0]["font_height"] for _y in (ty, last_ty))
        txs = {int(_t["tx"]) for _t in ty_groups[ty] if _t["text"].strip()}
        # prevent merge if both groups are rendering in the same x position.
        no_text_overlap = not (txs & last_txs)
        offset_less_than_font_height = abs(ty - last_ty) < fsz
        if no_text_overlap and offset_less_than_font_height:
            # Merge the current group into the previous line.
            ty_groups[last_ty] = sorted(
                ty_groups.pop(ty) + ty_groups[last_ty], key=lambda x: x["tx"]
            )
            last_txs |= txs
        else:
            last_ty = ty
            last_txs = txs
    if debug_path:  # pragma: no cover
        import json

        debug_path.joinpath("bt_groups.json").write_text(
            json.dumps(ty_groups, indent=2, default=str), "utf-8"
        )
    return ty_groups
def text_show_operations(
    ops: Iterator[Tuple[List[Any], bytes]],
    fonts: Dict[str, Font],
    strip_rotated: bool = True,
    debug_path: Optional[Path] = None,
) -> List[BTGroup]:
    """
    Extract text from BT/ET operator pairs.

    Args:
        ops (Iterator[Tuple[List, bytes]]): iterator of operators in content stream
        fonts (Dict[str, Font]): font dictionary
        strip_rotated: Removes text if rotated w.r.t. to the page. Defaults to True.
        debug_path (Path, optional): Path to a directory for saving debug output.

    Returns:
        List[BTGroup]: list of dicts of text rendered by each BT operator
    """
    state_mgr = TextStateManager()  # transformation stack manager
    debug = bool(debug_path)
    bt_groups: List[BTGroup] = []  # BT operator dict
    tj_debug: List[TextStateParams] = []  # Tj/TJ operator data (debug only)
    try:
        # Warn at most once per condition for the whole content stream.
        warned_rotation = False
        warned_uninterpretable_font = False
        while True:
            operands, op = next(ops)
            if op in (b"BT", b"q"):
                bts, tjs = recurs_to_target_op(
                    ops, state_mgr, b"ET" if op == b"BT" else b"Q", fonts, strip_rotated
                )
                if not warned_rotation and any(tj.rotated for tj in tjs):
                    warned_rotation = True
                    if strip_rotated:
                        logger_warning(
                            "Rotated text discovered. Output will be incomplete.",
                            __name__,
                        )
                    else:
                        logger_warning(
                            "Rotated text discovered. Layout will be degraded.",
                            __name__,
                        )
                if not warned_uninterpretable_font and any(not tj.font.interpretable for tj in tjs):
                    warned_uninterpretable_font = True
                    logger_warning(
                        "PDF contains an uninterpretable font. Output will be incomplete.",
                        __name__,
                    )
                bt_groups.extend(bts)
                if debug:  # pragma: no cover
                    tj_debug.extend(tjs)
            elif op == b"Tf":
                state_mgr.set_font(fonts[operands[0]], operands[1])
            else:  # set Tc, Tw, Tz, TL, and Ts if required. ignores all other ops
                state_mgr.set_state_param(op, operands)
    except StopIteration:
        pass
    # left align the data, i.e. decrement all tx values by min(tx)
    min_x = min((x["tx"] for x in bt_groups), default=0.0)
    bt_groups = [
        dict(ogrp, tx=ogrp["tx"] - min_x, displaced_tx=ogrp["displaced_tx"] - min_x)  # type: ignore[misc]
        for ogrp in sorted(
            bt_groups, key=lambda x: (x["ty"] * x["flip_sort"], -x["tx"]), reverse=True
        )
    ]
    if debug_path:  # pragma: no cover
        import json

        debug_path.joinpath("bts.json").write_text(
            json.dumps(bt_groups, indent=2, default=str), "utf-8"
        )
        debug_path.joinpath("tjs.json").write_text(
            json.dumps(
                tj_debug, indent=2, default=lambda x: getattr(x, "to_dict", str)(x)
            ),
            "utf-8",
        )
    return bt_groups
def fixed_char_width(bt_groups: List[BTGroup], scale_weight: float = 1.25) -> float:
    """
    Calculate the average character width, weighting each sample by the
    length of its rendered text, for conversion to fixed-width layout.

    Args:
        bt_groups (List[BTGroup]): List of dicts of text rendered by each
            BT operator
        scale_weight: weighting factor applied to each sample's text length.

    Returns:
        float: fixed character width
    """
    # One (width, weight) sample per BT group.
    samples = [
        (
            (grp["displaced_tx"] - grp["tx"]) / (len(grp["text"]) * scale_weight),
            len(grp["text"]) * scale_weight,
        )
        for grp in bt_groups
    ]
    total_weight = sum(weight for _, weight in samples)
    return sum(width * weight for width, weight in samples) / total_weight
def fixed_width_page(
    ty_groups: Dict[int, List[BTGroup]], char_width: float, space_vertically: bool, font_height_weight: float
) -> str:
    """
    Generate page text from text operations grouped by rendered y coordinate.

    Args:
        ty_groups: dict of text show ops as returned by y_coordinate_groups()
        char_width: fixed character width
        space_vertically: include blank lines inferred from y distance + font height.
        font_height_weight: multiplier for font height when calculating blank lines.

    Returns:
        str: page text in a fixed width format that closely adheres to the rendered
        layout in the source pdf.
    """
    rendered: List[str] = []
    prev_y = 0
    for y_coord, line_data in ty_groups.items():
        if space_vertically and rendered:
            # Infer blank lines from the vertical gap to the previous line.
            fh = line_data[0]["font_height"]
            if fh != 0:
                gap = int(abs(y_coord - prev_y) / (fh * font_height_weight)) - 1
                rendered.extend([""] * gap)
        buffer = ""
        prev_disp = 0.0
        for bt_op in line_data:
            column = int(bt_op["tx"] // char_width)
            # Pad only when this fragment starts right of the previous one
            # (negative pad collapses to no padding via string repetition).
            pad = (column - len(buffer)) if ceil(prev_disp) < int(bt_op["tx"]) else 0
            buffer = f"{buffer}{' ' * pad}{bt_op['text']}"
            prev_disp = bt_op["displaced_tx"]
        if buffer.strip() or rendered:
            # Replace stray control characters (0x0E-0x1F) with spaces.
            rendered.append(
                "".join(" " if 14 <= ord(ch) <= 31 else ch for ch in buffer)
            )
        prev_y = y_coord
    return "\n".join(ln.rstrip() for ln in rendered if space_vertically or ln.strip())

View File

@ -0,0 +1,152 @@
"""Font constants and classes for "layout" mode text operations"""
from dataclasses import dataclass, field
from typing import Any, Dict, Sequence, Union, cast
from ..._codecs import adobe_glyphs
from ...errors import ParseError
from ...generic import IndirectObject
from ._font_widths import STANDARD_WIDTHS
@dataclass
class Font:
    """
    A font object formatted for use during "layout" mode text extraction

    Attributes:
        subtype (str): font subtype
        space_width (int | float): width of a space character
        encoding (str | Dict[int, str]): font encoding
        char_map (dict): character map
        font_dictionary (dict): font dictionary
        width_map (Dict[str, int]): mapping of characters to widths
        interpretable (bool): Default True. If False, the font glyphs cannot
            be translated to characters, e.g. Type3 fonts that do not define
            a '/ToUnicode' mapping.
    """

    subtype: str
    space_width: Union[int, float]
    encoding: Union[str, Dict[int, str]]
    char_map: Dict[Any, Any]
    font_dictionary: Dict[Any, Any]
    width_map: Dict[str, int] = field(default_factory=dict, init=False)
    interpretable: bool = True

    def __post_init__(self) -> None:
        """Populate ``width_map`` from the font dictionary; flag uninterpretable fonts."""
        # Type3 fonts that do not specify a "/ToUnicode" mapping cannot be
        # reliably converted into character codes unless all named chars
        # in /CharProcs map to a standard adobe glyph. See § 9.10.2 of the
        # PDF 1.7 standard.
        if self.subtype == "/Type3" and "/ToUnicode" not in self.font_dictionary:
            self.interpretable = all(
                cname in adobe_glyphs
                for cname in self.font_dictionary.get("/CharProcs") or []
            )
        if not self.interpretable:  # save some overhead if font is not interpretable
            return
        # TrueType fonts have a /Widths array mapping character codes to widths
        if isinstance(self.encoding, dict) and "/Widths" in self.font_dictionary:
            first_char = self.font_dictionary.get("/FirstChar", 0)
            self.width_map = {
                self.encoding.get(idx + first_char, chr(idx + first_char)): width
                for idx, width in enumerate(self.font_dictionary["/Widths"])
            }
        # CID fonts have a /W array mapping character codes to widths stashed in /DescendantFonts
        if "/DescendantFonts" in self.font_dictionary:
            d_font: Dict[Any, Any]
            for d_font_idx, d_font in enumerate(
                self.font_dictionary["/DescendantFonts"]
            ):
                # Resolve indirect references in place so later lookups are direct.
                while isinstance(d_font, IndirectObject):
                    d_font = d_font.get_object()
                self.font_dictionary["/DescendantFonts"][d_font_idx] = d_font
                # Map character code -> surrogate char from the char map.
                ord_map = {
                    ord(_target): _surrogate
                    for _target, _surrogate in self.char_map.items()
                    if isinstance(_target, str)
                }
                # /W width definitions have two valid formats which can be mixed and matched:
                # (1) A character start index followed by a list of widths, e.g.
                # `45 [500 600 700]` applies widths 500, 600, 700 to characters 45-47.
                # (2) A character start index, a character stop index, and a width, e.g.
                # `45 65 500` applies width 500 to characters 45-65.
                skip_count = 0
                _w = d_font.get("/W", [])
                for idx, w_entry in enumerate(_w):
                    w_entry = w_entry.get_object()
                    if skip_count:
                        # Entry already consumed by a previous format match.
                        skip_count -= 1
                        continue
                    if not isinstance(w_entry, (int, float)):  # pragma: no cover
                        # We should never get here due to skip_count above. Add a
                        # warning and or use reader's "strict" to force an ex???
                        continue
                    # check for format (1): `int [int int int int ...]`
                    w_next_entry = _w[idx + 1].get_object()
                    if isinstance(w_next_entry, Sequence):
                        start_idx, width_list = w_entry, w_next_entry
                        self.width_map.update(
                            {
                                ord_map[_cidx]: _width
                                for _cidx, _width in zip(
                                    range(
                                        cast(int, start_idx),
                                        cast(int, start_idx) + len(width_list),
                                        1,
                                    ),
                                    width_list,
                                )
                                if _cidx in ord_map
                            }
                        )
                        skip_count = 1
                    # check for format (2): `int int int`
                    elif isinstance(w_next_entry, (int, float)) and isinstance(
                        _w[idx + 2].get_object(), (int, float)
                    ):
                        start_idx, stop_idx, const_width = (
                            w_entry,
                            w_next_entry,
                            _w[idx + 2].get_object(),
                        )
                        self.width_map.update(
                            {
                                ord_map[_cidx]: const_width
                                for _cidx in range(
                                    cast(int, start_idx), cast(int, stop_idx + 1), 1
                                )
                                if _cidx in ord_map
                            }
                        )
                        skip_count = 2
                    else:
                        # Note: this doesn't handle the case of out of bounds (reaching the end of the width definitions
                        # while expecting more elements). This raises an IndexError which is sufficient.
                        raise ParseError(
                            f"Invalid font width definition. Next elements: {w_entry}, {w_next_entry}, {_w[idx + 2]}"
                        )  # pragma: no cover
        if not self.width_map and "/BaseFont" in self.font_dictionary:
            # Fall back to the standard-14 width tables keyed by base font name.
            for key in STANDARD_WIDTHS:
                if self.font_dictionary["/BaseFont"].startswith(f"/{key}"):
                    self.width_map = STANDARD_WIDTHS[key]
                    break

    def word_width(self, word: str) -> float:
        """Sum of character widths specified in PDF font for the supplied word"""
        # Unknown characters fall back to twice the space width — heuristic
        # default, not from the PDF spec; TODO confirm.
        return sum(
            [self.width_map.get(char, self.space_width * 2) for char in word], 0.0
        )

    @staticmethod
    def to_dict(font_instance: "Font") -> Dict[str, Any]:
        """Dataclass to dict for json.dumps serialization."""
        return {
            k: getattr(font_instance, k) for k in font_instance.__dataclass_fields__
        }

View File

@ -0,0 +1,208 @@
# Widths for the standard 14 fonts as described on page 416 of the PDF 1.7 standard
STANDARD_WIDTHS = {
"Helvetica": { # 4 fonts, includes bold, oblique and boldoblique variants
" ": 278,
"!": 278,
'"': 355,
"#": 556,
"$": 556,
"%": 889,
"&": 667,
"'": 191,
"(": 333,
")": 333,
"*": 389,
"+": 584,
",": 278,
"-": 333,
".": 278,
"/": 278,
"0": 556,
"1": 556,
"2": 556,
"3": 556,
"4": 556,
"5": 556,
"6": 556,
"7": 556,
"8": 556,
"9": 556,
":": 278,
";": 278,
"<": 584,
"=": 584,
">": 584,
"?": 611,
"@": 975,
"A": 667,
"B": 667,
"C": 722,
"D": 722,
"E": 667,
"F": 611,
"G": 778,
"H": 722,
"I": 278,
"J": 500,
"K": 667,
"L": 556,
"M": 833,
"N": 722,
"O": 778,
"P": 667,
"Q": 944,
"R": 667,
"S": 667,
"T": 611,
"U": 278,
"V": 278,
"W": 584,
"X": 556,
"Y": 556,
"Z": 500,
"[": 556,
"\\": 556,
"]": 556,
"^": 278,
"_": 278,
"`": 278,
"a": 278,
"b": 278,
"c": 333,
"d": 556,
"e": 556,
"f": 556,
"g": 556,
"h": 556,
"i": 556,
"j": 556,
"k": 556,
"l": 556,
"m": 556,
"n": 278,
"o": 278,
"p": 556,
"q": 556,
"r": 500,
"s": 556,
"t": 556,
"u": 278,
"v": 500,
"w": 500,
"x": 222,
"y": 222,
"z": 556,
"{": 222,
"|": 833,
"}": 556,
"~": 556,
},
"Times": { # 4 fonts, includes bold, oblique and boldoblique variants
" ": 250,
"!": 333,
'"': 408,
"#": 500,
"$": 500,
"%": 833,
"&": 778,
"'": 180,
"(": 333,
")": 333,
"*": 500,
"+": 564,
",": 250,
"-": 333,
".": 250,
"/": 564,
"0": 500,
"1": 500,
"2": 500,
"3": 500,
"4": 500,
"5": 500,
"6": 500,
"7": 500,
"8": 500,
"9": 500,
":": 278,
";": 278,
"<": 564,
"=": 564,
">": 564,
"?": 444,
"@": 921,
"A": 722,
"B": 667,
"C": 667,
"D": 722,
"E": 611,
"F": 556,
"G": 722,
"H": 722,
"I": 333,
"J": 389,
"K": 722,
"L": 611,
"M": 889,
"N": 722,
"O": 722,
"P": 556,
"Q": 722,
"R": 667,
"S": 556,
"T": 611,
"U": 722,
"V": 722,
"W": 944,
"X": 722,
"Y": 722,
"Z": 611,
"[": 333,
"\\": 278,
"]": 333,
"^": 469,
"_": 500,
"`": 333,
"a": 444,
"b": 500,
"c": 444,
"d": 500,
"e": 444,
"f": 333,
"g": 500,
"h": 500,
"i": 278,
"j": 278,
"k": 500,
"l": 278,
"m": 722,
"n": 500,
"o": 500,
"p": 500,
"q": 500,
"r": 333,
"s": 389,
"t": 278,
"u": 500,
"v": 444,
"w": 722,
"x": 500,
"y": 444,
"z": 389,
"{": 348,
"|": 220,
"}": 348,
"~": 469,
},
}
# 4 fonts, includes bold, oblique and bold oblique variants
STANDARD_WIDTHS[
    "Courier"
] = dict.fromkeys(STANDARD_WIDTHS["Times"], 600)  # fixed width
STANDARD_WIDTHS["ZapfDingbats"] = dict.fromkeys(STANDARD_WIDTHS["Times"], 1000)  # 1 font
STANDARD_WIDTHS["Symbol"] = dict.fromkeys(STANDARD_WIDTHS["Times"], 500)  # 1 font
# add aliases per table H.3 on page 1110 of the PDF 1.7 standard
# NOTE: aliases share the same dict objects as the originals (no copies).
STANDARD_WIDTHS["CourierNew"] = STANDARD_WIDTHS["Courier"]
STANDARD_WIDTHS["Arial"] = STANDARD_WIDTHS["Helvetica"]
STANDARD_WIDTHS["TimesNewRoman"] = STANDARD_WIDTHS["Times"]

View File

@ -0,0 +1,217 @@
"""manage the PDF transform stack during "layout" mode text extraction"""
from collections import ChainMap, Counter
from typing import Any, Dict, List, MutableMapping, Union
from typing import ChainMap as ChainMapType
from typing import Counter as CounterType
from ...errors import PdfReadError
from .. import mult
from ._font import Font
from ._text_state_params import TextStateParams
TextStateManagerChainMapType = ChainMapType[Union[int, str], Union[float, bool]]
TextStateManagerDictType = MutableMapping[Union[int, str], Union[float, bool]]
class TextStateManager:
    """
    Tracks the current text state including cm/tm/trm transformation matrices.
    Attributes:
        transform_stack (ChainMap): ChainMap of cm/tm transformation matrices
        q_queue (Counter[int]): Counter of q operators
        q_depth (List[int]): list of q operator nesting levels
        Tc (float): character spacing
        Tw (float): word spacing
        Tz (int): horizontal scaling
        TL (float): leading
        Ts (float): text rise
        font (Font): font object
        font_size (int | float): font size
    """
    def __init__(self) -> None:
        # The bottom of the stack is the identity matrix; cm/tm/trm transforms
        # are layered on top of it as ChainMap children.
        self.transform_stack: TextStateManagerChainMapType = ChainMap(
            self.new_transform()
        )
        self.q_queue: CounterType[int] = Counter()
        self.q_depth = [0]
        self.Tc: float = 0.0
        self.Tw: float = 0.0
        self.Tz: float = 100.0
        self.TL: float = 0.0
        self.Ts: float = 0.0
        # No font until a Tf operator is seen; text_state_params() enforces this.
        self.font: Union[Font, None] = None
        self.font_size: Union[int, float] = 0
    def set_state_param(self, op: bytes, value: Union[float, List[Any]]) -> None:
        """
        Set a text state parameter. Supports Tc, Tz, Tw, TL, and Ts operators.
        Args:
            op: operator read from PDF stream as bytes. No action is taken
                for unsupported operators (see supported operators above).
            value (float | List[Any]): new parameter value. If a list,
                value[0] is used.
        """
        if op not in [b"Tc", b"Tz", b"Tw", b"TL", b"Ts"]:
            return
        # Operator mnemonics double as attribute names (self.Tc, self.Tz, ...).
        self.__setattr__(op.decode(), value[0] if isinstance(value, list) else value)
    def set_font(self, font: Font, size: float) -> None:
        """
        Set the current font and font_size.
        Args:
            font (Font): a layout mode Font
            size (float): font size
        """
        self.font = font
        self.font_size = size
    def text_state_params(self, value: Union[bytes, str] = "") -> TextStateParams:
        """
        Create a TextStateParams instance to display a text string. Type[bytes] values
        will be decoded implicitly.
        Args:
            value (str | bytes): text to associate with the captured state.
        Raises:
            PdfReadError: if font not set (no Tf operator in incoming pdf content stream)
        Returns:
            TextStateParams: current text state parameters
        """
        if not isinstance(self.font, Font):
            raise PdfReadError(
                "font not set: is PDF missing a Tf operator?"
            ) # pragma: no cover
        if isinstance(value, bytes):
            try:
                if isinstance(self.font.encoding, str):
                    # Named encoding: decode directly, keeping unpaired
                    # surrogates so no byte is silently dropped.
                    txt = value.decode(self.font.encoding, "surrogatepass")
                else:
                    # Dict encoding: map each byte via the font's table,
                    # falling back to that byte's own one-byte decoding.
                    txt = "".join(
                        self.font.encoding[x]
                        if x in self.font.encoding
                        else bytes((x,)).decode()
                        for x in value
                    )
            except (UnicodeEncodeError, UnicodeDecodeError):
                txt = value.decode("utf-8", "replace")
            # Apply the font's character-map substitutions after byte decoding.
            txt = "".join(
                self.font.char_map.get(x, x) for x in txt
            )
        else:
            txt = value
        return TextStateParams(
            txt,
            self.font,
            self.font_size,
            self.Tc,
            self.Tw,
            self.Tz,
            self.TL,
            self.Ts,
            self.effective_transform,
        )
    @staticmethod
    def raw_transform(
        _a: float = 1.0,
        _b: float = 0.0,
        _c: float = 0.0,
        _d: float = 1.0,
        _e: float = 0.0,
        _f: float = 0.0,
    ) -> Dict[int, float]:
        """Only a/b/c/d/e/f matrix params"""
        # Keys 0-5 so the dict can stand in for a 6-element matrix list.
        return dict(zip(range(6), map(float, (_a, _b, _c, _d, _e, _f))))
    @staticmethod
    def new_transform(
        _a: float = 1.0,
        _b: float = 0.0,
        _c: float = 0.0,
        _d: float = 1.0,
        _e: float = 0.0,
        _f: float = 0.0,
        is_text: bool = False,
        is_render: bool = False,
    ) -> TextStateManagerDictType:
        """Standard a/b/c/d/e/f matrix params + 'is_text' and 'is_render' keys"""
        result: Any = TextStateManager.raw_transform(_a, _b, _c, _d, _e, _f)
        result.update({"is_text": is_text, "is_render": is_render})
        return result
    def reset_tm(self) -> TextStateManagerChainMapType:
        """Clear all transforms from chainmap having is_text==True or is_render==True"""
        while (
            self.transform_stack.maps[0]["is_text"]
            or self.transform_stack.maps[0]["is_render"]
        ):
            self.transform_stack = self.transform_stack.parents
        return self.transform_stack
    def reset_trm(self) -> TextStateManagerChainMapType:
        """Clear all transforms from chainmap having is_render==True"""
        while self.transform_stack.maps[0]["is_render"]:
            self.transform_stack = self.transform_stack.parents
        return self.transform_stack
    def remove_q(self) -> TextStateManagerChainMapType:
        """Rewind to stack prior state after closing a 'q' with internal 'cm' ops"""
        self.transform_stack = self.reset_tm()
        # Drop as many cm layers as were recorded at this q nesting level.
        self.transform_stack.maps = self.transform_stack.maps[
            self.q_queue.pop(self.q_depth.pop(), 0) :
        ]
        return self.transform_stack
    def add_q(self) -> None:
        """Add another level to q_queue"""
        self.q_depth.append(len(self.q_depth))
    def add_cm(self, *args: Any) -> TextStateManagerChainMapType:
        """Concatenate an additional transform matrix"""
        self.transform_stack = self.reset_tm()
        # Count this cm against the current q level so remove_q() can pop
        # the matching number of layers later.
        self.q_queue.update(self.q_depth[-1:])
        self.transform_stack = self.transform_stack.new_child(self.new_transform(*args))
        return self.transform_stack
    def _complete_matrix(self, operands: List[float]) -> List[float]:
        """Adds a, b, c, and d to an "e/f only" operand set (e.g Td)"""
        if len(operands) == 2:  # this is a Td operator or equivalent
            operands = [1.0, 0.0, 0.0, 1.0, *operands]
        return operands
    def add_tm(self, operands: List[float]) -> TextStateManagerChainMapType:
        """Append a text transform matrix"""
        self.transform_stack = self.transform_stack.new_child(
            self.new_transform( # type: ignore[misc]
                *self._complete_matrix(operands), is_text=True # type: ignore[arg-type]
            )
        )
        return self.transform_stack
    def add_trm(self, operands: List[float]) -> TextStateManagerChainMapType:
        """Append a text rendering transform matrix"""
        self.transform_stack = self.transform_stack.new_child(
            self.new_transform( # type: ignore[misc]
                *self._complete_matrix(operands), is_text=True, is_render=True # type: ignore[arg-type]
            )
        )
        return self.transform_stack
    @property
    def effective_transform(self) -> List[float]:
        """Current effective transform accounting for cm, tm, and trm transforms"""
        # Multiply from the newest layer down so outer transforms apply last.
        eff_transform = [*self.transform_stack.maps[0].values()]
        for transform in self.transform_stack.maps[1:]:
            eff_transform = mult(eff_transform, transform) # type: ignore[arg-type] # dict has int keys 0-5
        return eff_transform

View File

@ -0,0 +1,129 @@
"""A dataclass that captures the CTM and Text State for a tj operation"""
import math
from dataclasses import dataclass, field
from typing import Any, Dict, List, Union
from .. import mult, orient
from ._font import Font
@dataclass
class TextStateParams:
    """
    Text state parameters and operator values for a single text value in a
    TJ or Tj PDF operation.
    Attributes:
        txt (str): the text to be rendered.
        font (Font): font object
        font_size (int | float): font size
        Tc (float): character spacing. Defaults to 0.0.
        Tw (float): word spacing. Defaults to 0.0.
        Tz (float): horizontal scaling. Defaults to 100.0.
        TL (float): leading, vertical displacement between text lines. Defaults to 0.0.
        Ts (float): text rise. Used for super/subscripts. Defaults to 0.0.
        transform (List[float]): effective transformation matrix.
        tx (float): x coord of rendered text, i.e. self.transform[4]
        ty (float): y coord of rendered text. May differ from self.transform[5] per self.Ts.
        displaced_tx (float): x coord immediately following rendered text
        space_tx (float): tx for a space character
        font_height (float): effective font height accounting for CTM
        flip_vertical (bool): True if y axis has been inverted (i.e. if self.transform[3] < 0.)
        rotated (bool): True if the text orientation is rotated with respect to the page.
    """
    txt: str
    font: Font
    font_size: Union[int, float]
    Tc: float = 0.0
    Tw: float = 0.0
    Tz: float = 100.0
    TL: float = 0.0
    Ts: float = 0.0
    transform: List[float] = field(
        default_factory=lambda: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
    )
    # Derived values below are computed in __post_init__ (init=False).
    tx: float = field(default=0.0, init=False)
    ty: float = field(default=0.0, init=False)
    displaced_tx: float = field(default=0.0, init=False)
    space_tx: float = field(default=0.0, init=False)
    font_height: float = field(default=0.0, init=False)
    flip_vertical: bool = field(default=False, init=False)
    rotated: bool = field(default=False, init=False)
    def __post_init__(self) -> None:
        # Undo a 90/270 degree orientation so downstream layout math can
        # assume horizontal text; record that we did so via self.rotated.
        if orient(self.transform) in (90, 270):
            self.transform = mult(
                [1.0, -self.transform[1], -self.transform[2], 1.0, 0.0, 0.0],
                self.transform,
            )
            self.rotated = True
        # self.transform[0] AND self.transform[3] < 0 indicates true rotation.
        # If only self.transform[3] < 0, the y coords are simply inverted.
        if orient(self.transform) == 180 and self.transform[0] < -1e-6:
            self.transform = mult([-1.0, 0.0, 0.0, -1.0, 0.0, 0.0], self.transform)
            self.rotated = True
        self.displaced_tx = self.displaced_transform()[4]
        self.tx = self.transform[4]
        self.ty = self.render_transform()[5]
        self.space_tx = round(self.word_tx(" "), 3)
        if self.space_tx < 1e-6:
            # if the " " char is assigned 0 width (e.g. for fine tuned spacing
            # with TJ int operators a la crazyones.pdf), calculate space_tx as
            # a TD_offset of -2 * font.space_width where font.space_width is
            # the space_width calculated in _cmap.py.
            self.space_tx = round(self.word_tx("", self.font.space_width * -2), 3)
        # Scale nominal font size by the vertical component of the transform.
        self.font_height = self.font_size * math.sqrt(
            self.transform[1] ** 2 + self.transform[3] ** 2
        )
        # flip_vertical handles PDFs generated by Microsoft Word's "publish" command.
        self.flip_vertical = self.transform[3] < -1e-6  # inverts y axis
    def font_size_matrix(self) -> List[float]:
        """Font size matrix"""
        return [
            self.font_size * (self.Tz / 100.0),
            0.0,
            0.0,
            self.font_size,
            0.0,
            self.Ts,
        ]
    def displaced_transform(self) -> List[float]:
        """Effective transform matrix after text has been rendered."""
        return mult(self.displacement_matrix(), self.transform)
    def render_transform(self) -> List[float]:
        """Effective transform matrix accounting for font size, Tz, and Ts."""
        return mult(self.font_size_matrix(), self.transform)
    def displacement_matrix(
        self, word: Union[str, None] = None, TD_offset: float = 0.0
    ) -> List[float]:
        """
        Text displacement matrix
        Args:
            word (str, optional): Defaults to None in which case self.txt displacement is
                returned.
            TD_offset (float, optional): translation applied by TD operator. Defaults to 0.0.
        """
        word = word if word is not None else self.txt
        return [1.0, 0.0, 0.0, 1.0, self.word_tx(word, TD_offset), 0.0]
    def word_tx(self, word: str, TD_offset: float = 0.0) -> float:
        """Horizontal text displacement for any word according to this text state"""
        return (
            (self.font_size * ((self.font.word_width(word) - TD_offset) / 1000.0))
            + self.Tc
            + word.count(" ") * self.Tw
        ) * (self.Tz / 100.0)
    @staticmethod
    def to_dict(inst: "TextStateParams") -> Dict[str, Any]:
        """Dataclass to dict for json.dumps serialization"""
        # The font object is not JSON serializable, so it is excluded.
        return {k: getattr(inst, k) for k in inst.__dataclass_fields__ if k != "font"}

View File

@ -0,0 +1,605 @@
# Copyright (c) 2006, Mathieu Fenniak
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# * The name of the author may not be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
"""Utility functions for PDF library."""
__author__ = "Mathieu Fenniak"
__author_email__ = "biziqe@mathieu.fenniak.net"
import functools
import logging
import re
import sys
import warnings
from dataclasses import dataclass
from datetime import datetime, timezone
from io import DEFAULT_BUFFER_SIZE
from os import SEEK_CUR
from typing import (
IO,
Any,
Dict,
List,
Optional,
Pattern,
Tuple,
Union,
overload,
)
if sys.version_info[:2] >= (3, 10):
# Python 3.10+: https://www.python.org/dev/peps/pep-0484/
from typing import TypeAlias
else:
from typing_extensions import TypeAlias
if sys.version_info >= (3, 11):
from typing import Self
else:
from typing_extensions import Self
from .errors import (
STREAM_TRUNCATED_PREMATURELY,
DeprecationError,
PdfStreamError,
)
TransformationMatrixType: TypeAlias = Tuple[
Tuple[float, float, float], Tuple[float, float, float], Tuple[float, float, float]
]
CompressedTransformationMatrix: TypeAlias = Tuple[
float, float, float, float, float, float
]
StreamType = IO[Any]
StrByteType = Union[str, StreamType]
def parse_iso8824_date(text: Optional[str]) -> Optional[datetime]:
    """
    Parse a PDF date string (ISO/IEC 8824 style, e.g. ``D:20230501120000Z``)
    into a :class:`datetime.datetime`.

    Args:
        text: the raw date string, or None.

    Returns:
        The parsed datetime (timezone-aware for ``+0000`` offsets), or None
        if ``text`` is None.

    Raises:
        ValueError: if the string matches none of the supported layouts.
    """
    if text is None:
        return None
    original = text
    # Tolerate strings missing the "D:" prefix.
    if text[0].isdigit():
        text = "D:" + text
    # Normalize a trailing Z/z (UTC) into an explicit +0000 offset.
    if text.endswith(("Z", "z")):
        text += "0000"
    text = text.replace("z", "+").replace("Z", "+").replace("'", "")
    sign_pos = max(text.find("+"), text.find("-"))
    # Pad a short offset like "+02" out to "+0200" for %z.
    if sign_pos > 0 and sign_pos != len(text) - 5:
        text += "00"
    for fmt in (
        "D:%Y",
        "D:%Y%m",
        "D:%Y%m%d",
        "D:%Y%m%d%H",
        "D:%Y%m%d%H%M",
        "D:%Y%m%d%H%M%S",
        "D:%Y%m%d%H%M%S%z",
    ):
        try:
            parsed = datetime.strptime(text, fmt)  # noqa: DTZ007
        except ValueError:
            continue
        if text.endswith("+0000"):
            parsed = parsed.replace(tzinfo=timezone.utc)
        return parsed
    raise ValueError(f"Can not convert date: {original}")
def _get_max_pdf_version_header(header1: str, header2: str) -> str:
versions = (
"%PDF-1.3",
"%PDF-1.4",
"%PDF-1.5",
"%PDF-1.6",
"%PDF-1.7",
"%PDF-2.0",
)
pdf_header_indices = []
if header1 in versions:
pdf_header_indices.append(versions.index(header1))
if header2 in versions:
pdf_header_indices.append(versions.index(header2))
if len(pdf_header_indices) == 0:
raise ValueError(f"Neither {header1!r} nor {header2!r} are proper headers")
return versions[max(pdf_header_indices)]
# PDF whitespace characters (NUL, TAB, LF, FF, CR, space).
WHITESPACES = (b"\x00", b"\t", b"\n", b"\f", b"\r", b" ")
# Same characters as one bytes object, and as a regex character class.
WHITESPACES_AS_BYTES = b"".join(WHITESPACES)
WHITESPACES_AS_REGEXP = b"[" + WHITESPACES_AS_BYTES + b"]"
def read_until_whitespace(stream: StreamType, maxchars: Optional[int] = None) -> bytes:
    """
    Read non-whitespace characters and return them.
    Stops upon encountering whitespace, EOF, or when maxchars is reached.
    Args:
        stream: The data stream from which was read.
        maxchars: The maximum number of bytes returned; by default unlimited.
    Returns:
        The data which was read.
    """
    collected = bytearray()
    while True:
        char = stream.read(1)
        # EOF returns b"", which is not "space" but must also terminate.
        if char.isspace() or not char:
            break
        collected += char
        if len(collected) == maxchars:
            break
    return bytes(collected)
def read_non_whitespace(stream: StreamType) -> bytes:
    """
    Find and read the next non-whitespace character (ignores whitespace).
    Args:
        stream: The data stream from which was read.
    Returns:
        The data which was read (b"" at EOF).
    """
    while (char := stream.read(1)) in WHITESPACES:
        pass
    return char
def skip_over_whitespace(stream: StreamType) -> bool:
    """
    Similar to read_non_whitespace, but return a boolean if at least one
    whitespace character was read.  Note that the first non-whitespace
    character encountered is consumed as well.
    Args:
        stream: The data stream from which was read.
    Returns:
        True if one or more whitespace was skipped, otherwise return False.
    """
    skipped = 0
    while (tok := stream.read(1)) in WHITESPACES:
        skipped += 1
    return skipped > 0
def check_if_whitespace_only(value: bytes) -> bool:
    """
    Check if the given value consists of whitespace characters only.
    An empty value counts as whitespace-only.
    Args:
        value: The bytes to check.
    Returns:
        True if the value only has whitespace characters, otherwise return False.
    """
    return not any(byte not in WHITESPACES_AS_BYTES for byte in value)
def skip_over_comment(stream: StreamType) -> None:
    """
    Advance the stream past a %-comment (through its EOL marker) if one
    starts at the current position; otherwise leave the position unchanged.
    Raises:
        PdfStreamError: if EOF is hit before the comment's line ends.
    """
    first = stream.read(1)
    stream.seek(-1, 1)
    if first != b"%":
        return
    tok = first
    while tok not in (b"\n", b"\r"):
        tok = stream.read(1)
        if tok == b"":
            raise PdfStreamError("File ended unexpectedly.")
def read_until_regex(stream: StreamType, regex: Pattern[bytes]) -> bytes:
    """
    Read until the regular expression pattern matched (ignore the match).
    Treats EOF on the underlying stream as the end of the token to be matched.
    The stream is left positioned at the start of the match.
    Args:
        regex: re.Pattern
    Returns:
        The read bytes.
    """
    buffer = b""
    # Read in 16-byte chunks, re-searching the accumulated buffer so a
    # match spanning a chunk boundary is still found.
    while chunk := stream.read(16):
        buffer += chunk
        match = regex.search(buffer)
        if match is not None:
            # Rewind to just before the match and drop it from the token.
            stream.seek(match.start() - len(buffer), 1)
            return buffer[: match.start()]
    return buffer
def read_block_backwards(stream: StreamType, to_read: int) -> bytes:
    """
    Given a stream at position X, read a block of size to_read ending at position X.
    This changes the stream's position to the beginning of where the block was
    read.
    Args:
        stream:
        to_read:
    Returns:
        The data which was read.
    Raises:
        PdfStreamError: if fewer than ``to_read`` bytes precede the position.
    """
    if to_read > stream.tell():
        raise PdfStreamError("Could not read malformed PDF file")
    # Jump back, read forward, then jump back again so the caller is left
    # at the start of the block.
    stream.seek(-to_read, SEEK_CUR)
    block = stream.read(to_read)
    stream.seek(-to_read, SEEK_CUR)
    return block
def read_previous_line(stream: StreamType) -> bytes:
    """
    Given a byte stream with current position X, return the previous line.
    All characters between the first CR/LF byte found before X
    (or, the start of the file, if no such byte is found) and position X
    are returned.
    After this call, the stream will be positioned one byte after the
    first non-CRLF character found beyond the first CR/LF byte before X,
    or, if no such byte is found, at the beginning of the stream.
    Args:
        stream: StreamType:
    Returns:
        The data which was read.
    Raises:
        PdfStreamError: if the stream is already at position 0.
    """
    line_content = []
    found_crlf = False
    if stream.tell() == 0:
        raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
    # Scan backwards one buffer-sized block at a time.
    while True:
        to_read = min(DEFAULT_BUFFER_SIZE, stream.tell())
        if to_read == 0:
            break
        # Read the block. After this, our stream will be one
        # beyond the initial position.
        block = read_block_backwards(stream, to_read)
        idx = len(block) - 1
        if not found_crlf:
            # We haven't found our first CR/LF yet.
            # Read off characters until we hit one.
            while idx >= 0 and block[idx] not in b"\r\n":
                idx -= 1
            if idx >= 0:
                found_crlf = True
        if found_crlf:
            # We found our first CR/LF already (on this block or
            # a previous one).
            # Our combined line is the remainder of the block
            # plus any previously read blocks.
            line_content.append(block[idx + 1 :])
            # Continue to read off any more CRLF characters.
            while idx >= 0 and block[idx] in b"\r\n":
                idx -= 1
        else:
            # Didn't find CR/LF yet - add this block to our
            # previously read blocks and continue.
            line_content.append(block)
        if idx >= 0:
            # We found the next non-CRLF character.
            # Set the stream position correctly, then break
            stream.seek(idx + 1, SEEK_CUR)
            break
    # Join all the blocks in the line (which are in reverse order)
    return b"".join(line_content[::-1])
def matrix_multiply(
    a: TransformationMatrixType, b: TransformationMatrixType
) -> TransformationMatrixType:
    """Multiply two 3x3 transformation matrices (``a @ b``), coercing every
    entry to float."""
    b_columns = list(zip(*b))
    return tuple(  # type: ignore[return-value]
        tuple(
            sum(float(x) * float(y) for x, y in zip(row, column))
            for column in b_columns
        )
        for row in a
    )
def mark_location(stream: StreamType) -> None:
    """Create text file showing current location in context."""
    # Mainly for debugging: dumps 5000 bytes before and after the current
    # position to pypdf_pdfLocation.txt with a "HERE" marker in between,
    # restoring the stream position afterwards.
    radius = 5000
    stream.seek(-radius, 1)
    before = stream.read(radius)
    after = stream.read(radius)
    with open("pypdf_pdfLocation.txt", "wb") as output_fh:
        output_fh.write(before)
        output_fh.write(b"HERE")
        output_fh.write(after)
    stream.seek(-radius, 1)
@overload
def ord_(b: str) -> int:
    ...
@overload
def ord_(b: bytes) -> bytes:
    ...
@overload
def ord_(b: int) -> int:
    ...
def ord_(b: Union[int, str, bytes]) -> Union[int, bytes]:
    """Normalize a one-character string to its code point; pass ints and
    bytes through unchanged."""
    return ord(b) if isinstance(b, str) else b
def deprecate(msg: str, stacklevel: int = 3) -> None:
    """Emit a DeprecationWarning for a feature scheduled for removal."""
    warnings.warn(msg, DeprecationWarning, stacklevel=stacklevel)
def deprecation(msg: str) -> None:
    """Raise for a feature that has already been removed."""
    raise DeprecationError(msg)
def deprecate_with_replacement(old_name: str, new_name: str, removed_in: str) -> None:
    """Issue a warning that a feature will be removed, but has a replacement."""
    # stacklevel=4 attributes the warning past this helper and deprecate(),
    # i.e. closer to the user's call site.
    deprecate(
        f"{old_name} is deprecated and will be removed in pypdf {removed_in}. Use {new_name} instead.",
        4,
    )
def deprecation_with_replacement(old_name: str, new_name: str, removed_in: str) -> None:
    """Raise an exception that a feature was already removed, but has a replacement."""
    deprecation(
        f"{old_name} is deprecated and was removed in pypdf {removed_in}. Use {new_name} instead."
    )
def deprecate_no_replacement(name: str, removed_in: str) -> None:
    """Issue a warning that a feature will be removed without replacement."""
    # stacklevel=4 attributes the warning past this helper and deprecate().
    deprecate(f"{name} is deprecated and will be removed in pypdf {removed_in}.", 4)
def deprecation_no_replacement(name: str, removed_in: str) -> None:
    """Raise an exception that a feature was already removed without replacement."""
    deprecation(f"{name} is deprecated and was removed in pypdf {removed_in}.")
def logger_error(msg: str, src: str) -> None:
    """
    Use this instead of logger.error directly.
    That allows people to overwrite it more easily.
    See the docs on when to use which:
    https://pypdf.readthedocs.io/en/latest/user/suppress-warnings.html
    Args:
        msg: message to log
        src: logger name; callers typically pass __name__
    """
    logging.getLogger(src).error(msg)
def logger_warning(msg: str, src: str) -> None:
    """
    Use this instead of logger.warning directly.
    That allows people to overwrite it more easily.
    ## Exception, warnings.warn, logger_warning
    - Exceptions should be used if the user should write code that deals with
      an error case, e.g. the PDF being completely broken.
    - warnings.warn should be used if the user needs to fix their code, e.g.
      DeprecationWarnings
    - logger_warning should be used if the user needs to know that an issue was
      handled by pypdf, e.g. a non-compliant PDF being read in a way that
      pypdf could apply a robustness fix to still read it. This applies mainly
      to strict=False mode.
    Args:
        msg: message to log
        src: logger name; callers typically pass __name__
    """
    logging.getLogger(src).warning(msg)
def rename_kwargs(
    func_name: str, kwargs: Dict[str, Any], aliases: Dict[str, str], fail: bool = False
) -> None:
    """
    Helper function to deprecate arguments.

    Translates deprecated keyword names to their replacements in ``kwargs``
    (mutated in place), warning — or raising, when ``fail`` is True — for
    each deprecated name encountered.

    Args:
        func_name: Name of the function to be deprecated
        kwargs:
        aliases:
        fail:
    """
    for old_term, new_term in aliases.items():
        if old_term not in kwargs:
            continue
        if fail:
            raise DeprecationError(
                f"{old_term} is deprecated as an argument. Use {new_term} instead"
            )
        if new_term in kwargs:
            # Both spellings supplied: ambiguous, so refuse.
            raise TypeError(
                f"{func_name} received both {old_term} and {new_term} as "
                f"an argument. {old_term} is deprecated. "
                f"Use {new_term} instead."
            )
        kwargs[new_term] = kwargs.pop(old_term)
        warnings.warn(
            message=(
                f"{old_term} is deprecated as an argument. Use {new_term} instead"
            ),
            category=DeprecationWarning,
        )
def _human_readable_bytes(bytes: int) -> str:
if bytes < 10**3:
return f"{bytes} Byte"
elif bytes < 10**6:
return f"{bytes / 10**3:.1f} kB"
elif bytes < 10**9:
return f"{bytes / 10**6:.1f} MB"
else:
return f"{bytes / 10**9:.1f} GB"
# The following class has been copied from Django:
# https://github.com/django/django/blob/adae619426b6f50046b3daaa744db52989c9d6db/django/utils/functional.py#L51-L65
# It received some modifications to comply with our own coding standards.
#
# Original license:
#
# ---------------------------------------------------------------------------------
# Copyright (c) Django Software Foundation and individual contributors.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification,
# are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# 3. Neither the name of Django nor the names of its contributors may be used
# to endorse or promote products derived from this software without
# specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# ---------------------------------------------------------------------------------
class classproperty:  # noqa: N801
    """
    Decorator that converts a method with a single cls argument into a property
    that can be accessed directly from the class.
    """
    def __init__(self, method=None) -> None:  # type: ignore # noqa: ANN001
        # The wrapped getter; may also be attached later via .getter().
        self.fget = method
    def __get__(self, instance, cls=None) -> Any:  # type: ignore # noqa: ANN001
        # Always invoke with the owning class, regardless of instance access.
        return self.fget(cls)
    def getter(self, method) -> Self:  # type: ignore # noqa: ANN001
        # Mirrors property.getter so the decorator is drop-in compatible.
        self.fget = method
        return self
@dataclass
class File:
    """Lightweight container describing a file stored within a PDF."""
    # NOTE(review): imported in the class body rather than at module level —
    # presumably to avoid a circular import with .generic; confirm.
    from .generic import IndirectObject
    name: str = ""
    """
    Filename as identified within the PDF file.
    """
    data: bytes = b""
    """
    Data as bytes.
    """
    indirect_reference: Optional[IndirectObject] = None
    """
    Reference to the object storing the stream.
    """
    def __str__(self) -> str:
        return f"{self.__class__.__name__}(name={self.name}, data: {_human_readable_bytes(len(self.data))})"
    def __repr__(self) -> str:
        # Same as __str__ with the data hash appended before the closing paren.
        return self.__str__()[:-1] + f", hash: {hash(self.data)})"
@functools.total_ordering
class Version:
    """
    Loosely ordered version identifier, e.g. ``"1.2.3"`` or ``"2.0b1"``.

    Each dot-separated component is parsed into an ``(integer_prefix, suffix)``
    pair; equality and ordering compare those pairs component-wise (remaining
    comparison operators are supplied by ``functools.total_ordering``).
    """
    COMPONENT_PATTERN = re.compile(r"^(\d+)(.*)$")
    def __init__(self, version_str: str) -> None:
        self.version_str = version_str
        self.components = self._parse_version(version_str)
    def _parse_version(self, version_str: str) -> List[Tuple[int, str]]:
        """Split ``version_str`` into ``(integer_prefix, suffix)`` pairs."""
        components = version_str.split(".")
        parsed_components = []
        for component in components:
            match = Version.COMPONENT_PATTERN.match(component)
            if not match:
                # No leading digits (e.g. "beta"): treat the numeric part as 0.
                parsed_components.append((0, component))
                continue
            # group(1) is always non-empty when the pattern matches (\d+).
            parsed_components.append((int(match.group(1)), match.group(2)))
        return parsed_components
    def __eq__(self, other: object) -> bool:
        if not isinstance(other, Version):
            return False
        return self.components == other.components
    def __hash__(self) -> int:
        # Defining __eq__ alone would implicitly set __hash__ = None and make
        # Version unusable in sets/dicts; hash the same data __eq__ compares
        # so equal versions hash equally.
        return hash(tuple(self.components))
    def __lt__(self, other: Any) -> bool:
        if not isinstance(other, Version):
            raise ValueError(f"Version cannot be compared against {type(other)}")
        for self_component, other_component in zip(self.components, other.components):
            self_value, self_suffix = self_component
            other_value, other_suffix = other_component
            if self_value < other_value:
                return True
            elif self_value > other_value:
                return False
            # Numeric parts equal: fall back to lexicographic suffix order.
            if self_suffix < other_suffix:
                return True
            elif self_suffix > other_suffix:
                return False
        # All shared components equal: the shorter version is the smaller one.
        return len(self.components) < len(other.components)

View File

@ -0,0 +1 @@
# Single source of truth for the installed package version.
__version__ = "5.4.0"

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,379 @@
"""Code in here is only used by pypdf.filters._xobj_to_image"""
import sys
from io import BytesIO
from typing import Any, Dict, List, Literal, Tuple, Union, cast
from ._utils import check_if_whitespace_only, logger_warning
from .constants import ColorSpaces
from .constants import FilterTypes as FT
from .constants import ImageAttributes as IA
from .errors import EmptyImageDataError, PdfReadError
from .generic import (
ArrayObject,
DecodedStreamObject,
EncodedStreamObject,
IndirectObject,
NullObject,
TextStringObject,
)
if sys.version_info[:2] >= (3, 10):
from typing import TypeAlias
else:
from typing_extensions import TypeAlias
try:
from PIL import Image, UnidentifiedImageError # noqa: F401
except ImportError:
raise ImportError(
"pillow is required to do image extraction. "
"It can be installed via 'pip install pypdf[image]'"
)
# Image-mode strings this module produces; "" means "undetermined", while
# "2bits"/"4bits" are interim markers later expanded to palette ("P") data.
mode_str_type: TypeAlias = Literal[
    "", "1", "RGB", "2bits", "4bits", "P", "L", "RGBA", "CMYK"
]
# Recursion limit for nested color spaces in _get_imagemode.
MAX_IMAGE_MODE_NESTING_DEPTH: int = 10
def _get_imagemode(
    color_space: Union[str, List[Any], Any],
    color_components: int,
    prev_mode: mode_str_type,
    depth: int = 0,
) -> Tuple[mode_str_type, bool]:
    """
    Resolve a PDF color space to a Pillow image mode, recursing through
    nested spaces (/Indexed, /Separation, /DeviceN, /ICCBased).
    Returns:
        Image mode, not taking into account mask (transparency).
        ColorInversion is required (like for some DeviceCMYK).
    """
    # Recursion guard: nested color spaces re-enter with depth + 1.
    if depth > MAX_IMAGE_MODE_NESTING_DEPTH:
        raise PdfReadError(
            "Color spaces nested too deeply. If required, consider increasing MAX_IMAGE_MODE_NESTING_DEPTH."
        )
    if isinstance(color_space, NullObject):
        return "", False
    if isinstance(color_space, str):
        pass
    elif not isinstance(color_space, list):
        raise PdfReadError(
            "Cannot interpret color space", color_space
        )  # pragma: no cover
    elif color_space[0].startswith("/Cal"):  # /CalRGB and /CalGray
        # Calibrated spaces are handled like their device counterparts.
        color_space = "/Device" + color_space[0][4:]
    elif color_space[0] == "/ICCBased":
        # Use the profile's component count and its /Alternate space, if any.
        icc_profile = color_space[1].get_object()
        color_components = cast(int, icc_profile["/N"])
        color_space = icc_profile.get("/Alternate", "")
    elif color_space[0] == "/Indexed":
        color_space = color_space[1].get_object()
        mode, invert_color = _get_imagemode(
            color_space, color_components, prev_mode, depth + 1
        )
        if mode in ("RGB", "CMYK"):
            # Indexed data is loaded as a palette ("P") image.
            mode = "P"
        return mode, invert_color
    elif color_space[0] == "/Separation":
        color_space = color_space[2]
        if isinstance(color_space, IndirectObject):
            color_space = color_space.get_object()
        mode, invert_color = _get_imagemode(
            color_space, color_components, prev_mode, depth + 1
        )
        # Separation spaces always request color inversion.
        return mode, True
    elif color_space[0] == "/DeviceN":
        original_color_space = color_space
        color_components = len(color_space[1])
        color_space = color_space[2]
        if isinstance(color_space, IndirectObject):  # pragma: no cover
            color_space = color_space.get_object()
        if color_space == "/DeviceCMYK" and color_components == 1:
            # Single-channel DeviceN over CMYK is rendered as inverted gray.
            if original_color_space[1][0] != "/Black":
                logger_warning(
                    f"Color {original_color_space[1][0]} converted to Gray. Please share PDF with pypdf dev team",
                    __name__,
                )
            return "L", True
        mode, invert_color = _get_imagemode(
            color_space, color_components, prev_mode, depth + 1
        )
        return mode, invert_color
    mode_map: Dict[str, mode_str_type] = {
        "1bit": "1", # must be zeroth position: color_components may index the values
        "/DeviceGray": "L", # must be first position: color_components may index the values
        "palette": "P", # must be second position: color_components may index the values
        "/DeviceRGB": "RGB", # must be third position: color_components may index the values
        "/DeviceCMYK": "CMYK", # must be fourth position: color_components may index the values
        "2bit": "2bits",
        "4bit": "4bits",
    }
    # Fall back: lookup by name, then by component count, then previous mode.
    mode = (
        mode_map.get(color_space)
        or list(mode_map.values())[color_components]
        or prev_mode
    )
    return mode, mode == "CMYK"
def bits2byte(data: bytes, size: Tuple[int, int], bits: int) -> bytes:
    """
    Unpack sub-byte pixels (e.g. 1/2/4 bits each) into one byte per pixel.

    Rows are byte-aligned: a partially consumed byte is skipped at the start
    of each new row.

    Args:
        data: packed pixel data, most significant bits first.
        size: (width, height) of the image.
        bits: bits per pixel.

    Returns:
        ``width * height`` bytes, one pixel value per byte, row-major.
    """
    width, height = size
    mask = (1 << bits) - 1
    out = bytearray(width * height)
    top_shift = 8 - bits  # shift exposing the first pixel of a source byte
    src = 0
    shift = top_shift
    write_pos = 0
    for _row in range(height):
        # Realign to a byte boundary at the start of every row.
        if shift != top_shift:
            src += 1
            shift = top_shift
        for _col in range(width):
            out[write_pos] = (data[src] >> shift) & mask
            write_pos += 1
            shift -= bits
            if shift < 0:
                src += 1
                shift = top_shift
    return bytes(out)
def _extended_image_frombytes(
    mode: str, size: Tuple[int, int], data: bytes
) -> Image.Image:
    """
    Build a PIL image from raw bytes, repairing streams that carry fewer
    samples per pixel than ``mode`` expects by replicating each byte.

    Raises:
        EmptyImageDataError: if ``data`` is empty.
        ValueError: if the data length cannot be reconciled with the size.
    """
    try:
        return Image.frombytes(mode, size, data)
    except ValueError as exc:
        pixel_count = size[0] * size[1]
        byte_count = len(data)
        if byte_count == 0:
            raise EmptyImageDataError(
                "Data is 0 bytes, cannot process an image from empty data."
            ) from exc
        if byte_count % pixel_count != 0:
            # Not a whole number of bytes per pixel: nothing we can repair.
            raise exc
        # Replicate each byte so every pixel supplies len(mode) samples.
        repeat = int(pixel_count * len(mode) / byte_count)
        data = b"".join(bytes((value,) * repeat) for value in data)
        return Image.frombytes(mode, size, data)
def _handle_flate(
    size: Tuple[int, int],
    data: bytes,
    mode: mode_str_type,
    color_space: str,
    colors: int,
    obj_as_text: str,
) -> Tuple[Image.Image, str, str, bool]:
    """
    Process an image encoded with FlateDecode.

    Args:
        size: (width, height) of the image.
        data: Decompressed image bytes.
        mode: Pillow mode guessed from the color space; "2bits"/"4bits" are
            placeholders for sub-byte palette images.
        color_space: Color space of the image; may be an ``ArrayObject`` such
            as ``[/Indexed base hival lookup]``.
        colors: Number of color components.
        obj_as_text: Textual representation of the image object (warnings only).

    Returns:
        Tuple of (img, image_format, extension, color inversion).
    """
    extension = ".png"  # mime_type = "image/png"
    image_format = "PNG"
    lookup: Any
    base: Any
    hival: Any
    if isinstance(color_space, ArrayObject) and color_space[0] == "/Indexed":
        # Unpack [/Indexed base hival lookup].
        color_space, base, hival, lookup = (value.get_object() for value in color_space)
    if mode == "2bits":
        # Sub-byte depths are unpacked to one byte per pixel first.
        mode = "P"
        data = bits2byte(data, size, 2)
    elif mode == "4bits":
        mode = "P"
        data = bits2byte(data, size, 4)
    img = _extended_image_frombytes(mode, size, data)
    if color_space == "/Indexed":
        # Normalize the lookup table to raw bytes.
        if isinstance(lookup, (EncodedStreamObject, DecodedStreamObject)):
            lookup = lookup.get_data()
        if isinstance(lookup, TextStringObject):
            lookup = lookup.original_bytes
        if isinstance(lookup, str):
            lookup = lookup.encode()
        try:
            # nb: bytes per palette entry; conv/mode: PIL conversion targets.
            nb, conv, mode = {  # type: ignore
                "1": (0, "", ""),
                "L": (1, "P", "L"),
                "P": (0, "", ""),
                "RGB": (3, "P", "RGB"),
                "CMYK": (4, "P", "CMYK"),
            }[_get_imagemode(base, 0, "")[0]]
        except KeyError:  # pragma: no cover
            logger_warning(
                f"Base {base} not coded please share the pdf file with pypdf dev team",
                __name__,
            )
            lookup = None
        else:
            if img.mode == "1":
                # Two values ("high" and "low").
                expected_count = 2 * nb
                actual_count = len(lookup)
                if actual_count != expected_count:
                    if actual_count < expected_count:
                        logger_warning(
                            f"Not enough lookup values: Expected {expected_count}, got {actual_count}.",
                            __name__
                        )
                        lookup += bytes([0] * (expected_count - actual_count))
                    elif not check_if_whitespace_only(lookup[expected_count:]):
                        logger_warning(
                            f"Too many lookup values: Expected {expected_count}, got {actual_count}.",
                            __name__
                        )
                    # Truncate in all mismatch cases (no-op after padding above).
                    lookup = lookup[:expected_count]
                colors_arr = [lookup[:nb], lookup[nb:]]
                # Expand each 1-bit pixel to its palette entry.
                arr = b"".join(
                    b"".join(
                        colors_arr[1 if img.getpixel((x, y)) > 127 else 0]
                        for x in range(img.size[0])
                    )
                    for y in range(img.size[1])
                )
                img = Image.frombytes(mode, img.size, arr)
            else:
                img = img.convert(conv)
                if len(lookup) != (hival + 1) * nb:
                    logger_warning(f"Invalid Lookup Table in {obj_as_text}", __name__)
                    lookup = None
                elif mode == "L":
                    # gray lookup does not work : it is converted to a similar RGB lookup
                    lookup = b"".join([bytes([b, b, b]) for b in lookup])
                    mode = "RGB"
                # TODO : cf https://github.com/py-pdf/pypdf/pull/2039
                # this is a work around until PIL is able to process CMYK images
                elif mode == "CMYK":
                    _rgb = []
                    for _c, _m, _y, _k in (
                        lookup[n : n + 4] for n in range(0, 4 * (len(lookup) // 4), 4)
                    ):
                        _r = int(255 * (1 - _c / 255) * (1 - _k / 255))
                        _g = int(255 * (1 - _m / 255) * (1 - _k / 255))
                        _b = int(255 * (1 - _y / 255) * (1 - _k / 255))
                        _rgb.append(bytes((_r, _g, _b)))
                    lookup = b"".join(_rgb)
                    mode = "RGB"
                if lookup is not None:
                    img.putpalette(lookup, rawmode=mode)
                img = img.convert("L" if base == ColorSpaces.DEVICE_GRAY else "RGB")
    elif not isinstance(color_space, NullObject) and color_space[0] == "/ICCBased":
        # see Table 66 - Additional Entries Specific to an ICC Profile
        # Stream Dictionary
        mode2 = _get_imagemode(color_space, colors, mode)[0]
        if mode != mode2:
            img = Image.frombytes(mode2, size, data)  # reloaded as mode may have change
    if mode == "CMYK":
        # NOTE(review): PNG cannot carry CMYK, hence the TIFF fallback.
        extension = ".tif"
        image_format = "TIFF"
    return img, image_format, extension, False
def _handle_jpx(
    size: Tuple[int, int],
    data: bytes,
    mode: mode_str_type,
    color_space: str,
    colors: int,
) -> Tuple[Image.Image, str, str, bool]:
    """
    Process an image encoded with JPXDecode (JPEG 2000).

    Returns:
        Tuple of (img, image_format, extension, inversion).
    """
    extension = ".jp2"  # mime_type = "image/x-jp2"
    img1 = Image.open(BytesIO(data), formats=("JPEG2000",))
    mode, invert_color = _get_imagemode(color_space, colors, mode)
    if mode == "":
        # Color space gave no answer: trust the decoded image's own mode.
        mode = cast(mode_str_type, img1.mode)
        invert_color = mode in ("CMYK",)
    if img1.mode == "RGBA" and mode == "RGB":
        # Keep the alpha channel the JPEG 2000 stream carries.
        mode = "RGBA"
    # we need to convert to the good mode
    if img1.mode == mode or {img1.mode, mode} == {"L", "P"}:  # compare (unordered) sets
        # L and P are indexed modes which should not be changed.
        img = img1
    elif {img1.mode, mode} == {"RGBA", "CMYK"}:
        # RGBA / CMYK are 4bytes encoding where
        # the encoding should be corrected
        img = Image.frombytes(mode, img1.size, img1.tobytes())
    else:  # pragma: no cover
        img = img1.convert(mode)
    # for CMYK conversion :
    # https://stackoverflow.com/questions/38855022/conversion-from-cmyk-to-rgb-with-pillow-is-different-from-that-of-photoshop
    # not implemented for the moment as I need to get properly the ICC
    if img.mode == "CMYK":
        img = img.convert("RGB")
    image_format = "JPEG2000"
    return img, image_format, extension, invert_color
def _apply_decode(
    img: Image.Image,
    x_object_obj: Dict[str, Any],
    lfilters: FT,
    color_space: Union[str, List[Any], Any],
    invert_color: bool,
) -> Image.Image:
    """
    Apply the image's /Decode array, remapping component values.

    CMYK images and other color spaces without a /Decode entry still
    require reverting the scale (cf p243,2§ last sentence), so a
    reversing default is synthesized for those cases.
    """
    band_count = len(img.getbands())
    reverse_scale_needed = (
        img.mode == "CMYK" and lfilters in (FT.DCT_DECODE, FT.JPX_DECODE)
    ) or (invert_color and img.mode == "L")
    decode = x_object_obj.get(
        IA.DECODE, [1.0, 0.0] * band_count if reverse_scale_needed else None
    )
    if isinstance(color_space, ArrayObject):
        family = color_space[0].get_object()
        if family == "/Indexed":
            decode = None  # decode is meaningless if Indexed
        if family == "/Separation":
            decode = [1.0, 0.0] * band_count
    is_identity = decode is None or all(
        value == index % 2 for index, value in enumerate(decode)
    )
    if not is_identity:
        # Build a 256-entry lookup table per band mapping j -> [dmin, dmax].
        lut: List[int] = []
        for start in range(0, len(decode), 2):
            dmin, dmax = decode[start], decode[start + 1]
            lut.extend(
                round(255.0 * (j / 255.0 * (dmax - dmin) + dmin)) for j in range(256)
            )
        img = img.point(lut)
    return img
def _get_mode_and_invert_color(
    x_object_obj: Dict[str, Any], colors: int, color_space: Union[str, List[Any], Any]
) -> Tuple[mode_str_type, bool]:
    """
    Derive the Pillow image mode and whether the colors must be inverted.

    Returns:
        Tuple of (mode, invert_color).
    """
    if (
        IA.COLOR_SPACE in x_object_obj
        and x_object_obj[IA.COLOR_SPACE] == ColorSpaces.DEVICE_RGB
    ):
        # https://pillow.readthedocs.io/en/stable/handbook/concepts.html#modes
        mode: mode_str_type = "RGB"
    if x_object_obj.get("/BitsPerComponent", 8) < 8:
        # Sub-byte depth: resolve via the "<n>bit" pseudo color space name.
        mode, invert_color = _get_imagemode(
            f"{x_object_obj.get('/BitsPerComponent', 8)}bit", 0, ""
        )
    else:
        # NOTE(review): colors == 1 with a non-gray color space is mapped to
        # 2 components — presumably to select the palette entry of the mode
        # table in _get_imagemode; confirm against that function.
        mode, invert_color = _get_imagemode(
            color_space,
            2
            if (
                colors == 1
                and (
                    not isinstance(color_space, NullObject)
                    and "Gray" not in color_space
                )
            )
            else colors,
            "",
        )
    return mode, invert_color

View File

@ -0,0 +1,42 @@
"""
PDF specifies several annotation types which pypdf makes available here.
The names of the annotations and their attributes do not reflect the names in
the specification in all cases. For example, the PDF standard defines a
'Square' annotation that does not actually need to be square. For this reason,
pypdf calls it 'Rectangle'.
At their core, all annotation types are DictionaryObjects. That means if pypdf
does not implement a feature, users can easily extend the given functionality.
"""
from ._base import NO_FLAGS, AnnotationDictionary
from ._markup_annotations import (
Ellipse,
FreeText,
Highlight,
Line,
MarkupAnnotation,
Polygon,
PolyLine,
Rectangle,
Text,
)
from ._non_markup_annotations import Link, Popup
__all__ = [
"NO_FLAGS",
"AnnotationDictionary",
"Ellipse",
"FreeText",
"Highlight",
"Line",
"Link",
"MarkupAnnotation",
"PolyLine",
"Polygon",
"Popup",
"Rectangle",
"Text",
]

View File

@ -0,0 +1,27 @@
from abc import ABC
from ..constants import AnnotationFlag
from ..generic import NameObject, NumberObject
from ..generic._data_structures import DictionaryObject
class AnnotationDictionary(DictionaryObject, ABC):
    """
    Base dictionary for all annotation types.

    Sets ``/Type`` to ``/Annot``; subclasses add their ``/Subtype`` and
    type-specific entries.
    """

    def __init__(self) -> None:
        # NameObject is imported at module level; the previous function-local
        # re-import was redundant and has been removed.
        # /Rect should not be added here as Polygon and PolyLine can automatically set it
        self[NameObject("/Type")] = NameObject("/Annot")

    # The flags were NOT added to the constructor on purpose:
    # We expect that most users don't want to change the default.
    # If they do, they can use the property. The default is 0.
    @property
    def flags(self) -> AnnotationFlag:
        """Annotation flags (/F); defaults to 0 when the entry is absent."""
        return self.get(NameObject("/F"), AnnotationFlag(0))

    @flags.setter
    def flags(self, value: AnnotationFlag) -> None:
        self[NameObject("/F")] = NumberObject(value)


NO_FLAGS = AnnotationFlag(0)

View File

@ -0,0 +1,315 @@
import sys
from abc import ABC
from typing import Any, List, Optional, Tuple, Union
from .._utils import deprecation_with_replacement
from ..constants import AnnotationFlag
from ..generic import ArrayObject, DictionaryObject
from ..generic._base import (
BooleanObject,
FloatObject,
NameObject,
NumberObject,
TextStringObject,
)
from ..generic._rectangle import RectangleObject
from ..generic._utils import hex_to_rgb
from ._base import NO_FLAGS, AnnotationDictionary
if sys.version_info[:2] >= (3, 10):
from typing import TypeAlias
else:
# PEP 613 introduced typing.TypeAlias with Python 3.10
# For older Python versions, the backport typing_extensions is necessary:
from typing_extensions import TypeAlias
Vertex: TypeAlias = Tuple[float, float]
def _get_bounding_rectangle(vertices: List[Vertex]) -> RectangleObject:
    """Return the smallest axis-aligned rectangle enclosing all vertices."""
    # Seed with the first vertex (IndexError on empty input, as before).
    x_min, y_min = vertices[0]
    x_max, y_max = x_min, y_min
    for x, y in vertices:
        x_min = x if x < x_min else x_min
        x_max = x if x > x_max else x_max
        y_min = y if y < y_min else y_min
        y_max = y if y > y_max else y_max
    return RectangleObject((x_min, y_min, x_max, y_max))
class MarkupAnnotation(AnnotationDictionary, ABC):
    """
    Base class for all markup annotations.

    Args:
        title_bar: Text to be displayed in the title bar of the annotation;
            by convention this is the name of the author
    """
    def __init__(self, *, title_bar: Optional[str] = None) -> None:
        # /T is only written when a title was actually supplied.
        if title_bar is not None:
            self[NameObject("/T")] = TextStringObject(title_bar)
class Text(MarkupAnnotation):
    """
    A text annotation.

    Args:
        rect: array of four integers ``[xLL, yLL, xUR, yUR]``
            specifying the clickable rectangular area
        text: The text that is added to the document
        open: Whether the annotation is initially displayed open
        flags: Annotation flags
    """

    def __init__(
        self,
        *,
        rect: Union[RectangleObject, Tuple[float, float, float, float]],
        text: str,
        open: bool = False,
        flags: int = NO_FLAGS,
        **kwargs: Any,
    ) -> None:
        super().__init__(**kwargs)
        entries = (
            ("/Subtype", NameObject("/Text")),
            ("/Rect", RectangleObject(rect)),
            ("/Contents", TextStringObject(text)),
            ("/Open", BooleanObject(open)),
            ("/Flags", NumberObject(flags)),
        )
        for key, value in entries:
            self[NameObject(key)] = value
class FreeText(MarkupAnnotation):
    """
    A FreeText annotation.

    Args:
        text: Text displayed in the annotation.
        rect: array of four integers ``[xLL, yLL, xUR, yUR]``
            specifying the annotation rectangle.
        font: Font family name, e.g. "Helvetica".
        bold: Render the text bold.
        italic: Render the text italic.
        font_size: Font size including the unit, e.g. "14pt".
        font_color: Text color as an RRGGBB hex string.
        border_color: Border color as a hex string, or None for no border.
        background_color: Fill color as a hex string, or None for none.
    """

    def __init__(
        self,
        *,
        text: str,
        rect: Union[RectangleObject, Tuple[float, float, float, float]],
        font: str = "Helvetica",
        bold: bool = False,
        italic: bool = False,
        font_size: str = "14pt",
        font_color: str = "000000",
        border_color: Optional[str] = "000000",
        background_color: Optional[str] = "ffffff",
        **kwargs: Any,
    ) -> None:
        super().__init__(**kwargs)
        # Table 225 of the 1.7 reference ("CSS2 style attributes used in rich text strings")
        font_str = "font: "
        font_str += "italic " if italic else "normal "
        font_str += "bold " if bold else "normal "
        font_str = f"{font_str}{font_size} {font}"
        font_str = f"{font_str};text-align:left;color:#{font_color}"
        default_appearance_string = ""
        if border_color:
            for st in hex_to_rgb(border_color):
                default_appearance_string = f"{default_appearance_string}{st} "
            default_appearance_string = f"{default_appearance_string}rg"
        # /Subtype and /Rect were previously assigned twice (directly and in
        # this update); writing them once yields the identical dictionary.
        self.update(
            {
                NameObject("/Subtype"): NameObject("/FreeText"),
                NameObject("/Rect"): RectangleObject(rect),
                NameObject("/Contents"): TextStringObject(text),
                # font size color
                NameObject("/DS"): TextStringObject(font_str),
                NameObject("/DA"): TextStringObject(default_appearance_string),
            }
        )
        if border_color is None:
            # Border Style
            self[NameObject("/BS")] = DictionaryObject(
                {
                    # width of 0 means no border
                    NameObject("/W"): NumberObject(0)
                }
            )
        if background_color is not None:
            self[NameObject("/C")] = ArrayObject(
                [FloatObject(n) for n in hex_to_rgb(background_color)]
            )
class Line(MarkupAnnotation):
    """A straight-line annotation from ``p1`` to ``p2``."""

    def __init__(
        self,
        p1: Vertex,
        p2: Vertex,
        rect: Union[RectangleObject, Tuple[float, float, float, float]],
        text: str = "",
        **kwargs: Any,
    ) -> None:
        super().__init__(**kwargs)
        # /L holds the two endpoints, /LE the line-ending styles,
        # /IC a mid-gray interior color.
        endpoints = ArrayObject(
            [FloatObject(value) for value in (p1[0], p1[1], p2[0], p2[1])]
        )
        line_endings = ArrayObject([NameObject("/None") for _ in range(2)])
        interior = ArrayObject([FloatObject(0.5) for _ in range(3)])
        self.update(
            {
                NameObject("/Subtype"): NameObject("/Line"),
                NameObject("/Rect"): RectangleObject(rect),
                NameObject("/L"): endpoints,
                NameObject("/LE"): line_endings,
                NameObject("/IC"): interior,
                NameObject("/Contents"): TextStringObject(text),
            }
        )
class PolyLine(MarkupAnnotation):
    """
    An open polyline annotation through the given vertices.

    Args:
        vertices: Non-empty list of (x, y) points.

    Raises:
        ValueError: If ``vertices`` is empty.
    """

    def __init__(
        self,
        vertices: List[Vertex],
        **kwargs: Any,
    ) -> None:
        super().__init__(**kwargs)
        if len(vertices) == 0:
            # The previous message incorrectly said "polygon".
            raise ValueError("A polyline needs at least 1 vertex with two coordinates")
        coord_list = []
        for x, y in vertices:
            coord_list.append(NumberObject(x))
            coord_list.append(NumberObject(y))
        self.update(
            {
                NameObject("/Subtype"): NameObject("/PolyLine"),
                NameObject("/Vertices"): ArrayObject(coord_list),
                NameObject("/Rect"): RectangleObject(_get_bounding_rectangle(vertices)),
            }
        )
class Rectangle(MarkupAnnotation):
    """A rectangle ("Square") annotation covering ``rect``."""

    def __init__(
        self,
        rect: Union[RectangleObject, Tuple[float, float, float, float]],
        *,
        interior_color: Optional[str] = None,
        **kwargs: Any,
    ) -> None:
        if "interiour_color" in kwargs:
            # Historical misspelling, kept for backward compatibility.
            deprecation_with_replacement("interiour_color", "interior_color", "5.0.0")
            interior_color = kwargs.pop("interiour_color")
        super().__init__(**kwargs)
        self[NameObject("/Type")] = NameObject("/Annot")
        self[NameObject("/Subtype")] = NameObject("/Square")
        self[NameObject("/Rect")] = RectangleObject(rect)
        if interior_color:
            self[NameObject("/IC")] = ArrayObject(
                [FloatObject(n) for n in hex_to_rgb(interior_color)]
            )
class Highlight(MarkupAnnotation):
    """A text-highlight annotation over the given quad points."""

    def __init__(
        self,
        *,
        rect: Union[RectangleObject, Tuple[float, float, float, float]],
        quad_points: ArrayObject,
        highlight_color: str = "ff0000",
        printing: bool = False,
        **kwargs: Any,
    ) -> None:
        super().__init__(**kwargs)
        color_components = ArrayObject(
            [FloatObject(component) for component in hex_to_rgb(highlight_color)]
        )
        self.update(
            {
                NameObject("/Subtype"): NameObject("/Highlight"),
                NameObject("/Rect"): RectangleObject(rect),
                NameObject("/QuadPoints"): quad_points,
                NameObject("/C"): color_components,
            }
        )
        if printing:
            self.flags = AnnotationFlag.PRINT
class Ellipse(MarkupAnnotation):
    """An ellipse ("Circle") annotation inscribed in ``rect``."""

    def __init__(
        self,
        rect: Union[RectangleObject, Tuple[float, float, float, float]],
        *,
        interior_color: Optional[str] = None,
        **kwargs: Any,
    ) -> None:
        if "interiour_color" in kwargs:
            # Historical misspelling, kept for backward compatibility.
            deprecation_with_replacement("interiour_color", "interior_color", "5.0.0")
            interior_color = kwargs.pop("interiour_color")
        super().__init__(**kwargs)
        self[NameObject("/Type")] = NameObject("/Annot")
        self[NameObject("/Subtype")] = NameObject("/Circle")
        self[NameObject("/Rect")] = RectangleObject(rect)
        if interior_color:
            self[NameObject("/IC")] = ArrayObject(
                [FloatObject(n) for n in hex_to_rgb(interior_color)]
            )
class Polygon(MarkupAnnotation):
    """A closed polygon annotation through the given vertices."""

    def __init__(
        self,
        vertices: List[Tuple[float, float]],
        **kwargs: Any,
    ) -> None:
        super().__init__(**kwargs)
        if not vertices:
            raise ValueError("A polygon needs at least 1 vertex with two coordinates")
        coords: List[NumberObject] = []
        for x, y in vertices:
            coords.extend((NumberObject(x), NumberObject(y)))
        self.update(
            {
                NameObject("/Type"): NameObject("/Annot"),
                NameObject("/Subtype"): NameObject("/Polygon"),
                NameObject("/Vertices"): ArrayObject(coords),
                NameObject("/IT"): NameObject("/PolygonCloud"),
                NameObject("/Rect"): RectangleObject(_get_bounding_rectangle(vertices)),
            }
        )

View File

@ -0,0 +1,106 @@
from typing import TYPE_CHECKING, Any, Optional, Tuple, Union
from ..generic._base import (
BooleanObject,
NameObject,
NumberObject,
TextStringObject,
)
from ..generic._data_structures import ArrayObject, DictionaryObject
from ..generic._fit import DEFAULT_FIT, Fit
from ..generic._rectangle import RectangleObject
from ._base import AnnotationDictionary
class Link(AnnotationDictionary):
    """
    A link annotation: either an external URL or an internal page jump.

    Args:
        rect: Clickable rectangle ``[xLL, yLL, xUR, yUR]``.
        border: Optional /Border array (horizontal radius, vertical radius,
            width, plus an optional dash-pattern array as fourth element).
        url: External URL to open; mutually exclusive with ``target_page_index``.
        target_page_index: Zero-based index of the destination page;
            mutually exclusive with ``url``.
        fit: How the destination page is fitted in the viewer.

    Raises:
        ValueError: If neither or both of ``url`` and ``target_page_index``
            are provided.
    """

    def __init__(
        self,
        *,
        rect: Union[RectangleObject, Tuple[float, float, float, float]],
        border: Optional[ArrayObject] = None,
        url: Optional[str] = None,
        target_page_index: Optional[int] = None,
        fit: Fit = DEFAULT_FIT,
        **kwargs: Any,
    ) -> None:
        super().__init__(**kwargs)
        if TYPE_CHECKING:
            from ..types import BorderArrayType
        is_external = url is not None
        is_internal = target_page_index is not None
        if not is_external and not is_internal:
            raise ValueError(
                "Either 'url' or 'target_page_index' have to be provided. Both were None."
            )
        if is_external and is_internal:
            # Previously this reused the "have to be provided" wording, which
            # was misleading when both arguments were given.
            raise ValueError(
                "Exactly one of 'url' or 'target_page_index' must be provided. "
                f"{url=}, {target_page_index=}"
            )
        border_arr: BorderArrayType
        if border is not None:
            border_arr = [NumberObject(n) for n in border[:3]]
            if len(border) == 4:
                # Fourth element is the dash pattern array.
                dash_pattern = ArrayObject([NumberObject(n) for n in border[3]])
                border_arr.append(dash_pattern)
        else:
            border_arr = [NumberObject(0)] * 3
        self.update(
            {
                NameObject("/Type"): NameObject("/Annot"),
                NameObject("/Subtype"): NameObject("/Link"),
                NameObject("/Rect"): RectangleObject(rect),
                NameObject("/Border"): ArrayObject(border_arr),
            }
        )
        if is_external:
            self[NameObject("/A")] = DictionaryObject(
                {
                    NameObject("/S"): NameObject("/URI"),
                    NameObject("/Type"): NameObject("/Action"),
                    NameObject("/URI"): TextStringObject(url),
                }
            )
        if is_internal:
            # The destination cannot be resolved yet (the target page may not
            # exist in the writer); this placeholder needs to be updated later!
            dest_deferred = DictionaryObject(
                {
                    "target_page_index": NumberObject(target_page_index),
                    "fit": NameObject(fit.fit_type),
                    "fit_args": fit.fit_args,
                }
            )
            self[NameObject("/Dest")] = dest_deferred
class Popup(AnnotationDictionary):
    """A popup annotation, usually attached to a parent markup annotation."""

    def __init__(
        self,
        *,
        rect: Union[RectangleObject, Tuple[float, float, float, float]],
        parent: Optional[DictionaryObject] = None,
        open: bool = False,
        **kwargs: Any,
    ) -> None:
        super().__init__(**kwargs)
        entries = {
            NameObject("/Subtype"): NameObject("/Popup"),
            NameObject("/Rect"): RectangleObject(rect),
            NameObject("/Open"): BooleanObject(open),
        }
        self.update(entries)
        if parent:
            # /Parent must be an indirect reference; when the parent has not
            # been registered with a writer, the field is skipped with a warning.
            try:
                self[NameObject("/Parent")] = parent.indirect_reference
            except AttributeError:
                from .._utils import logger_warning

                logger_warning(
                    "Unregistered Parent object : No Parent field set",
                    __name__,
                )

View File

@ -0,0 +1,722 @@
"""Various constants, enums, and flags to aid readability."""
from enum import Enum, IntFlag, auto, unique
from typing import Dict, Tuple
class StrEnum(str, Enum):  # Once we are on Python 3.11+: enum.StrEnum
    """Enum whose members are strings and stringify to their raw value."""

    def __str__(self) -> str:
        return f"{self.value}"
class Core:
    """Keywords that don't quite belong anywhere else."""
    OUTLINES = "/Outlines"
    THREADS = "/Threads"
    PAGE = "/Page"
    PAGES = "/Pages"
    CATALOG = "/Catalog"
class TrailerKeys:
    """Keys of the file trailer dictionary."""
    ROOT = "/Root"
    ENCRYPT = "/Encrypt"
    ID = "/ID"
    INFO = "/Info"
    SIZE = "/Size"
    PREV = "/Prev"
class CatalogAttributes:
    """Selected entries of the document catalog dictionary."""
    NAMES = "/Names"
    DESTS = "/Dests"
class EncryptionDictAttributes:
    """
    Additional encryption dictionary entries for the standard security handler.
    Table 3.19, Page 122.
    Table 21 of the 2.0 manual.
    """
    R = "/R"  # number, required; revision of the standard security handler
    O = "/O"  # 32-byte string, required # noqa: E741
    U = "/U"  # 32-byte string, required
    P = "/P"  # integer flag, required; permitted operations
    ENCRYPT_METADATA = "/EncryptMetadata"  # boolean flag, optional
class UserAccessPermissions(IntFlag):
    """
    Table 3.20 User access permissions.
    Table 22 of the 2.0 manual.
    """
    R1 = 1
    R2 = 2
    PRINT = 4
    MODIFY = 8
    EXTRACT = 16
    ADD_OR_MODIFY = 32
    R7 = 64
    R8 = 128
    FILL_FORM_FIELDS = 256
    EXTRACT_TEXT_AND_GRAPHICS = 512
    ASSEMBLE_DOC = 1024
    PRINT_TO_REPRESENTATION = 2048
    R13 = 2**12
    R14 = 2**13
    R15 = 2**14
    R16 = 2**15
    R17 = 2**16
    R18 = 2**17
    R19 = 2**18
    R20 = 2**19
    R21 = 2**20
    R22 = 2**21
    R23 = 2**22
    R24 = 2**23
    R25 = 2**24
    R26 = 2**25
    R27 = 2**26
    R28 = 2**27
    R29 = 2**28
    R30 = 2**29
    R31 = 2**30
    R32 = 2**31
    @classmethod
    def _is_reserved(cls, name: str) -> bool:
        """Check if the given name corresponds to a reserved flag entry."""
        return name.startswith("R") and name[1:].isdigit()
    @classmethod
    def _is_active(cls, name: str) -> bool:
        """Check if the given reserved name defaults to 1 = active."""
        return name not in {"R1", "R2"}
    def to_dict(self) -> Dict[str, bool]:
        """Convert the given flag value to a corresponding verbose name mapping."""
        return {
            name.lower(): (self & member) == member
            for name, member in UserAccessPermissions.__members__.items()
            if not UserAccessPermissions._is_reserved(name)
        }
    @classmethod
    def from_dict(cls, value: Dict[str, bool]) -> "UserAccessPermissions":
        """Convert the verbose name mapping to the corresponding flag value."""
        remaining = value.copy()
        result = cls(0)
        for name, member in cls.__members__.items():
            if cls._is_reserved(name):
                # Reserved names have a required value. Use it.
                if cls._is_active(name):
                    result |= member
                continue
            if remaining.pop(name.lower(), False):
                result |= member
        if remaining:
            raise ValueError(f"Unknown dictionary keys: {remaining!r}")
        return result
    @classmethod
    def all(cls) -> "UserAccessPermissions":
        """All permissions granted (every bit except the always-zero R1/R2)."""
        return cls((2**32 - 1) - cls.R1 - cls.R2)
class Resources:
    """
    Table 3.30 Entries in a resource dictionary.
    Table 34 in the 2.0 reference.
    """
    EXT_G_STATE = "/ExtGState"  # dictionary, optional
    COLOR_SPACE = "/ColorSpace"  # dictionary, optional
    PATTERN = "/Pattern"  # dictionary, optional
    SHADING = "/Shading"  # dictionary, optional
    XOBJECT = "/XObject"  # dictionary, optional
    FONT = "/Font"  # dictionary, optional
    PROC_SET = "/ProcSet"  # array, optional
    PROPERTIES = "/Properties"  # dictionary, optional
class Ressources:  # deprecated
    """
    Use :class: `Resources` instead.
    .. deprecated:: 5.0.0
    """
class PagesAttributes:
    """§7.7.3.2 of the 1.7 and 2.0 reference."""
    TYPE = "/Type"  # name, required; must be /Pages
    PARENT = "/Parent"  # dictionary, required; indirect reference to pages object
    KIDS = "/Kids"  # array, required; List of indirect references
    COUNT = "/Count"
    # integer, required; the number of leaf nodes (page objects)
    # that are descendants of this node within the page tree
class PageAttributes:
    """§7.7.3.3 of the 1.7 and 2.0 reference."""
    TYPE = "/Type"  # name, required; must be /Page
    PARENT = "/Parent"  # dictionary, required; a pages object
    LAST_MODIFIED = (
        "/LastModified"  # date, optional; date and time of last modification
    )
    RESOURCES = "/Resources"  # dictionary, required if there are any
    MEDIABOX = "/MediaBox"  # rectangle, required; rectangle specifying page size
    CROPBOX = "/CropBox"  # rectangle, optional
    BLEEDBOX = "/BleedBox"  # rectangle, optional
    TRIMBOX = "/TrimBox"  # rectangle, optional
    ARTBOX = "/ArtBox"  # rectangle, optional
    BOX_COLOR_INFO = "/BoxColorInfo"  # dictionary, optional
    CONTENTS = "/Contents"  # stream or array, optional
    ROTATE = "/Rotate"  # integer, optional; page rotation in degrees
    GROUP = "/Group"  # dictionary, optional; page group
    THUMB = "/Thumb"  # stream, optional; indirect reference to image of the page
    B = "/B"  # array, optional
    DUR = "/Dur"  # number, optional
    TRANS = "/Trans"  # dictionary, optional
    ANNOTS = "/Annots"  # array, optional; an array of annotations
    AA = "/AA"  # dictionary, optional
    METADATA = "/Metadata"  # stream, optional
    PIECE_INFO = "/PieceInfo"  # dictionary, optional
    STRUCT_PARENTS = "/StructParents"  # integer, optional
    ID = "/ID"  # byte string, optional
    PZ = "/PZ"  # number, optional
    SEPARATION_INFO = "/SeparationInfo"  # dictionary, optional
    TABS = "/Tabs"  # name, optional
    TEMPLATE_INSTANTIATED = "/TemplateInstantiated"  # name, optional
    PRES_STEPS = "/PresSteps"  # dictionary, optional
    USER_UNIT = "/UserUnit"  # number, optional
    VP = "/VP"  # dictionary, optional
    AF = "/AF"  # array of dictionaries, optional
    OUTPUT_INTENTS = "/OutputIntents"  # array, optional
    D_PART = "/DPart"  # dictionary, required, if this page is within the range of a DPart, not permitted otherwise
class FileSpecificationDictionaryEntries:
    """Table 3.41 Entries in a file specification dictionary."""
    Type = "/Type"
    FS = "/FS"  # The name of the file system to be used to interpret this file specification
    F = "/F"  # A file specification string of the form described in §3.10.1
    UF = "/UF"  # A Unicode string of the file as described in §3.10.1
    DOS = "/DOS"  # platform-specific file name (legacy)
    Mac = "/Mac"  # platform-specific file name (legacy)
    Unix = "/Unix"  # platform-specific file name (legacy)
    ID = "/ID"
    V = "/V"
    EF = "/EF"  # dictionary, containing a subset of the keys F, UF, DOS, Mac, and Unix
    RF = "/RF"  # dictionary, containing arrays of /EmbeddedFile
    DESC = "/Desc"  # description of the file
    Cl = "/Cl"
class StreamAttributes:
    """
    Table 4.2.
    Table 5 in the 2.0 reference.
    """
    LENGTH = "/Length"  # integer, required
    FILTER = "/Filter"  # name or array of names, optional
    DECODE_PARMS = "/DecodeParms"  # variable, optional -- 'decodeParams is wrong
@unique
class FilterTypes(StrEnum):
    """§7.4 of the 1.7 and 2.0 references."""
    ASCII_HEX_DECODE = "/ASCIIHexDecode"  # abbreviation: AHx
    ASCII_85_DECODE = "/ASCII85Decode"  # abbreviation: A85
    LZW_DECODE = "/LZWDecode"  # abbreviation: LZW
    FLATE_DECODE = "/FlateDecode"  # abbreviation: Fl, PDF 1.2
    RUN_LENGTH_DECODE = "/RunLengthDecode"  # abbreviation: RL
    CCITT_FAX_DECODE = "/CCITTFaxDecode"  # abbreviation: CCF
    DCT_DECODE = "/DCTDecode"  # abbreviation: DCT
    JPX_DECODE = "/JPXDecode"
class FilterTypeAbbreviations:
    """§8.9.7 of the 1.7 and 2.0 references."""
    AHx = "/AHx"
    A85 = "/A85"
    LZW = "/LZW"
    FL = "/Fl"  # FlateDecode
    RL = "/RL"
    CCF = "/CCF"
    DCT = "/DCT"
class LzwFilterParameters:
    """
    Table 4.4.
    Table 8 in the 2.0 reference.
    """
    PREDICTOR = "/Predictor"  # integer
    COLORS = "/Colors"  # integer
    BITS_PER_COMPONENT = "/BitsPerComponent"  # integer
    COLUMNS = "/Columns"  # integer
    EARLY_CHANGE = "/EarlyChange"  # integer
class CcittFaxDecodeParameters:
    """
    Table 4.5.
    Table 11 in the 2.0 reference.
    """
    K = "/K"  # integer
    END_OF_LINE = "/EndOfLine"  # boolean
    ENCODED_BYTE_ALIGN = "/EncodedByteAlign"  # boolean
    COLUMNS = "/Columns"  # integer
    ROWS = "/Rows"  # integer
    END_OF_BLOCK = "/EndOfBlock"  # boolean
    BLACK_IS_1 = "/BlackIs1"  # boolean
    DAMAGED_ROWS_BEFORE_ERROR = "/DamagedRowsBeforeError"  # integer
class ImageAttributes:
    """§11.6.5 of the 1.7 and 2.0 references."""
    TYPE = "/Type"  # name, required; must be /XObject
    SUBTYPE = "/Subtype"  # name, required; must be /Image
    NAME = "/Name"  # name, required
    WIDTH = "/Width"  # integer, required
    HEIGHT = "/Height"  # integer, required
    BITS_PER_COMPONENT = "/BitsPerComponent"  # integer, required
    COLOR_SPACE = "/ColorSpace"  # name, required
    DECODE = "/Decode"  # array, optional
    INTENT = "/Intent"  # string, optional
    INTERPOLATE = "/Interpolate"  # boolean, optional
    IMAGE_MASK = "/ImageMask"  # boolean, optional
    MASK = "/Mask"  # 1-bit image mask stream
    S_MASK = "/SMask"  # dictionary or name, optional
class ColorSpaces:
    """Names of the device color spaces."""
    DEVICE_RGB = "/DeviceRGB"
    DEVICE_CMYK = "/DeviceCMYK"
    DEVICE_GRAY = "/DeviceGray"
class TypArguments:
    """Table 8.2 of the PDF 1.7 reference."""
    LEFT = "/Left"
    RIGHT = "/Right"
    BOTTOM = "/Bottom"
    TOP = "/Top"
class TypFitArguments:
    """Table 8.2 of the PDF 1.7 reference."""
    FIT = "/Fit"
    FIT_V = "/FitV"
    FIT_BV = "/FitBV"
    FIT_B = "/FitB"
    FIT_H = "/FitH"
    FIT_BH = "/FitBH"
    FIT_R = "/FitR"
    XYZ = "/XYZ"
class GoToActionArguments:
    """Entries of a go-to action dictionary."""
    S = "/S"  # name, required: type of action
    D = "/D"  # name / byte string /array, required: Destination to jump to
class AnnotationDictionaryAttributes:
    """Table 8.15 Entries common to all annotation dictionaries."""
    Type = "/Type"
    Subtype = "/Subtype"
    Rect = "/Rect"
    Contents = "/Contents"
    P = "/P"
    NM = "/NM"
    M = "/M"
    F = "/F"
    AP = "/AP"
    AS = "/AS"
    DA = "/DA"
    Border = "/Border"
    C = "/C"
    StructParent = "/StructParent"
    OC = "/OC"
class InteractiveFormDictEntries:
    """Entries of the interactive form (AcroForm) dictionary."""
    Fields = "/Fields"
    NeedAppearances = "/NeedAppearances"
    SigFlags = "/SigFlags"
    CO = "/CO"
    DR = "/DR"
    DA = "/DA"
    Q = "/Q"
    XFA = "/XFA"
class FieldDictionaryAttributes:
    """
    Entries common to all field dictionaries (Table 8.69 PDF 1.7 reference)
    (*very partially documented here*).
    FFBits provides the constants used for `/Ff` from Table 8.70/8.75/8.77/8.79
    """
    FT = "/FT"  # name, required for terminal fields
    Parent = "/Parent"  # dictionary, required for children
    Kids = "/Kids"  # array, sometimes required
    T = "/T"  # text string, optional
    TU = "/TU"  # text string, optional
    TM = "/TM"  # text string, optional
    Ff = "/Ff"  # integer, optional
    V = "/V"  # text string or array, optional
    DV = "/DV"  # text string, optional
    AA = "/AA"  # dictionary, optional
    Opt = "/Opt"  # array, optional
    class FfBits(IntFlag):
        """
        Ease building /Ff flags
        Some entries may be specific to:
        * Text (Tx) (Table 8.75 PDF 1.7 reference)
        * Buttons (Btn) (Table 8.77 PDF 1.7 reference)
        * Choice (Ch) (Table 8.79 PDF 1.7 reference)
        """
        ReadOnly = 1 << 0
        """common to Tx/Btn/Ch in Table 8.70"""
        Required = 1 << 1
        """common to Tx/Btn/Ch in Table 8.70"""
        NoExport = 1 << 2
        """common to Tx/Btn/Ch in Table 8.70"""
        Multiline = 1 << 12
        """Tx"""
        Password = 1 << 13
        """Tx"""
        NoToggleToOff = 1 << 14
        """Btn"""
        Radio = 1 << 15
        """Btn"""
        Pushbutton = 1 << 16
        """Btn"""
        Combo = 1 << 17
        """Ch"""
        Edit = 1 << 18
        """Ch"""
        Sort = 1 << 19
        """Ch"""
        FileSelect = 1 << 20
        """Tx"""
        MultiSelect = 1 << 21
        """Tx"""
        DoNotSpellCheck = 1 << 22
        """Tx/Ch"""
        DoNotScroll = 1 << 23
        """Tx"""
        Comb = 1 << 24
        """Tx"""
        # NOTE(review): RadiosInUnison and RichText share bit 25, so RichText
        # becomes an IntFlag alias of RadiosInUnison — verify intent.
        RadiosInUnison = 1 << 25
        """Btn"""
        RichText = 1 << 25
        """Tx"""
        CommitOnSelChange = 1 << 26
        """Ch"""
    @classmethod
    def attributes(cls) -> Tuple[str, ...]:
        """
        Get a tuple of all the attributes present in a Field Dictionary.
        This method returns a tuple of all the attribute constants defined in
        the FieldDictionaryAttributes class. These attributes correspond to the
        entries that are common to all field dictionaries as specified in the
        PDF 1.7 reference.
        Returns:
            A tuple containing all the attribute constants.
        """
        return (
            cls.TM,
            cls.T,
            cls.FT,
            cls.Parent,
            cls.TU,
            cls.Ff,
            cls.V,
            cls.DV,
            cls.Kids,
            cls.AA,
        )
    @classmethod
    def attributes_dict(cls) -> Dict[str, str]:
        """
        Get a dictionary of attribute keys and their human-readable names.
        This method returns a dictionary where the keys are the attribute
        constants defined in the FieldDictionaryAttributes class and the values
        are their corresponding human-readable names. These attributes
        correspond to the entries that are common to all field dictionaries as
        specified in the PDF 1.7 reference.
        Returns:
            A dictionary containing attribute keys and their names.
        """
        return {
            cls.FT: "Field Type",
            cls.Parent: "Parent",
            cls.T: "Field Name",
            cls.TU: "Alternate Field Name",
            cls.TM: "Mapping Name",
            cls.Ff: "Field Flags",
            cls.V: "Value",
            cls.DV: "Default Value",
        }
class CheckboxRadioButtonAttributes:
    """Table 8.76 Field flags common to all field types."""
    Opt = "/Opt"  # Options, Optional
    @classmethod
    def attributes(cls) -> Tuple[str, ...]:
        """
        Get a tuple of all the attributes present in a Field Dictionary.
        This method returns a tuple of all the attribute constants defined in
        the CheckboxRadioButtonAttributes class. These attributes correspond to
        the entries that are common to all field dictionaries as specified in
        the PDF 1.7 reference.
        Returns:
            A tuple containing all the attribute constants.
        """
        return (cls.Opt,)
    @classmethod
    def attributes_dict(cls) -> Dict[str, str]:
        """
        Get a dictionary of attribute keys and their human-readable names.
        This method returns a dictionary where the keys are the attribute
        constants defined in the CheckboxRadioButtonAttributes class and the
        values are their corresponding human-readable names. These attributes
        correspond to the entries that are common to all field dictionaries as
        specified in the PDF 1.7 reference.
        Returns:
            A dictionary containing attribute keys and their names.
        """
        return {
            cls.Opt: "Options",
        }
class FieldFlag(IntFlag):
    """Table 8.70 Field flags common to all field types."""
    READ_ONLY = 1
    REQUIRED = 2
    NO_EXPORT = 4
class DocumentInformationAttributes:
"""Table 10.2 Entries in the document information dictionary."""
TITLE = "/Title" # text string, optional
AUTHOR = "/Author" # text string, optional
SUBJECT = "/Subject" # text string, optional
KEYWORDS = "/Keywords" # text string, optional
CREATOR = "/Creator" # text string, optional
PRODUCER = "/Producer" # text string, optional
CREATION_DATE = "/CreationDate" # date, optional
MOD_DATE = "/ModDate" # date, optional
TRAPPED = "/Trapped" # name, optional
class PageLayouts:
    """
    Valid values for the catalog's /PageLayout entry.

    Page 84, PDF 1.4 reference.
    Page 115, PDF 2.0 reference.
    """

    SINGLE_PAGE = "/SinglePage"  # display one page at a time
    ONE_COLUMN = "/OneColumn"  # one continuous column of pages
    TWO_COLUMN_LEFT = "/TwoColumnLeft"  # two columns, odd pages on the left
    TWO_COLUMN_RIGHT = "/TwoColumnRight"  # two columns, odd pages on the right
    TWO_PAGE_LEFT = "/TwoPageLeft"  # (PDF 1.5) two pages, odd pages on the left
    TWO_PAGE_RIGHT = "/TwoPageRight"  # (PDF 1.5) two pages, odd pages on the right
class GraphicsStateParameters:
    """Table 58 Entries in a Graphics State Parameter Dictionary"""

    TYPE = "/Type"  # name, optional
    LW = "/LW"  # number, optional: line width
    LC = "/LC"  # integer, optional: line cap style
    LJ = "/LJ"  # integer, optional: line join style
    ML = "/ML"  # number, optional: miter limit
    D = "/D"  # array, optional: dash pattern
    RI = "/RI"  # name, optional: rendering intent
    OP = "/OP"  # overprint flag (stroking)
    op = "/op"  # overprint flag (non-stroking)
    OPM = "/OPM"  # overprint mode
    FONT = "/Font"  # array, optional
    BG = "/BG"  # black-generation function
    BG2 = "/BG2"  # black-generation function or /Default
    UCR = "/UCR"  # undercolor-removal function
    UCR2 = "/UCR2"  # undercolor-removal function or /Default
    TR = "/TR"  # transfer function
    TR2 = "/TR2"  # transfer function or /Default
    HT = "/HT"  # halftone
    FL = "/FL"  # flatness tolerance
    SM = "/SM"  # smoothness tolerance
    SA = "/SA"  # automatic stroke adjustment
    BM = "/BM"  # blend mode
    S_MASK = "/SMask"  # dictionary or name, optional: soft mask
    CA = "/CA"  # alpha constant (stroking)
    ca = "/ca"  # alpha constant (non-stroking)
    AIS = "/AIS"  # alpha source flag
    TK = "/TK"  # text knockout flag
class CatalogDictionary:
    """§7.7.2 of the 1.7 and 2.0 references."""

    # Keys of the document catalog — the root dictionary of a PDF file.
    TYPE = "/Type"  # name, required; must be /Catalog
    VERSION = "/Version"  # name
    EXTENSIONS = "/Extensions"  # dictionary, optional; ISO 32000-1
    PAGES = "/Pages"  # dictionary, required
    PAGE_LABELS = "/PageLabels"  # number tree, optional
    NAMES = "/Names"  # dictionary, optional
    DESTS = "/Dests"  # dictionary, optional
    VIEWER_PREFERENCES = "/ViewerPreferences"  # dictionary, optional
    PAGE_LAYOUT = "/PageLayout"  # name, optional
    PAGE_MODE = "/PageMode"  # name, optional
    OUTLINES = "/Outlines"  # dictionary, optional
    THREADS = "/Threads"  # array, optional
    OPEN_ACTION = "/OpenAction"  # array or dictionary or name, optional
    AA = "/AA"  # dictionary, optional
    URI = "/URI"  # dictionary, optional
    ACRO_FORM = "/AcroForm"  # dictionary, optional
    METADATA = "/Metadata"  # stream, optional
    STRUCT_TREE_ROOT = "/StructTreeRoot"  # dictionary, optional
    MARK_INFO = "/MarkInfo"  # dictionary, optional
    LANG = "/Lang"  # text string, optional
    SPIDER_INFO = "/SpiderInfo"  # dictionary, optional
    OUTPUT_INTENTS = "/OutputIntents"  # array, optional
    PIECE_INFO = "/PieceInfo"  # dictionary, optional
    OC_PROPERTIES = "/OCProperties"  # dictionary, optional
    PERMS = "/Perms"  # dictionary, optional
    LEGAL = "/Legal"  # dictionary, optional
    REQUIREMENTS = "/Requirements"  # array, optional
    COLLECTION = "/Collection"  # dictionary, optional
    NEEDS_RENDERING = "/NeedsRendering"  # boolean, optional
    DSS = "/DSS"  # dictionary, optional
    AF = "/AF"  # array of dictionaries, optional
    D_PART_ROOT = "/DPartRoot"  # dictionary, optional
class OutlineFontFlag(IntFlag):
    """A class used as an enumerable flag for formatting an outline font."""

    italic = 1  # bit 1
    bold = 2  # bit 2
class PageLabelStyle:
    """
    Numbering styles for page labels.

    Table 8.10 in the 1.7 reference.
    Table 161 in the 2.0 reference.
    """

    DECIMAL = "/D"  # Decimal Arabic numerals
    UPPERCASE_ROMAN = "/R"  # Uppercase Roman numerals
    LOWERCASE_ROMAN = "/r"  # Lowercase Roman numerals
    UPPERCASE_LETTER = "/A"  # Uppercase letters
    LOWERCASE_LETTER = "/a"  # Lowercase letters
class AnnotationFlag(IntFlag):
    """See §12.5.3 "Annotation Flags"."""

    # Bit positions 1-10 of the annotation dictionary's /F entry.
    INVISIBLE = 1
    HIDDEN = 2
    PRINT = 4
    NO_ZOOM = 8
    NO_ROTATE = 16
    NO_VIEW = 32
    READ_ONLY = 64
    LOCKED = 128
    TOGGLE_NO_VIEW = 256
    LOCKED_CONTENTS = 512
# All constant-holding classes in this module whose attributes enumerate
# PDF dictionary keys.
PDF_KEYS = (
    AnnotationDictionaryAttributes,
    CatalogAttributes,
    CatalogDictionary,
    CcittFaxDecodeParameters,
    CheckboxRadioButtonAttributes,
    ColorSpaces,
    Core,
    DocumentInformationAttributes,
    EncryptionDictAttributes,
    FieldDictionaryAttributes,
    FileSpecificationDictionaryEntries,
    FilterTypeAbbreviations,
    FilterTypes,
    GoToActionArguments,
    GraphicsStateParameters,
    ImageAttributes,
    InteractiveFormDictEntries,
    LzwFilterParameters,
    PageAttributes,
    PageLayouts,
    PagesAttributes,
    Resources,
    StreamAttributes,
    TrailerKeys,
    TypArguments,
    TypFitArguments,
)
class ImageType(IntFlag):
    """Bit flags selecting which kinds of images an operation considers."""

    NONE = 0
    XOBJECT_IMAGES = auto()  # images stored as XObjects
    INLINE_IMAGES = auto()  # images embedded inline in content streams
    DRAWING_IMAGES = auto()  # images produced by drawing operations
    ALL = XOBJECT_IMAGES | INLINE_IMAGES | DRAWING_IMAGES
    IMAGES = ALL  # for consistency with ObjectDeletionFlag

View File

@ -0,0 +1,66 @@
"""
All errors/exceptions pypdf raises and all of the warnings it uses.
Please note that broken PDF files might cause other Exceptions.
"""
class DeprecationError(Exception):
    """Raised when a deprecated pypdf feature is used."""
class DependencyError(Exception):
    """
    Raised when a required dependency is not available.

    A dependency is a library or module that pypdf depends on but which
    cannot be imported in the current environment.
    """
# Root of the library-specific exception hierarchy.
class PyPdfError(Exception):
    """Base class for all exceptions raised by pypdf."""
# Base class for the more specific read-time errors defined below.
class PdfReadError(PyPdfError):
    """Raised when there is an issue reading a PDF file."""
# Direct subclass of PyPdfError (not of PdfReadError).
class PageSizeNotDefinedError(PyPdfError):
    """Raised when the page size of a PDF document is not defined."""
# NOTE: a UserWarning, not part of the PyPdfError hierarchy.
class PdfReadWarning(UserWarning):
    """Issued when there is a potential issue reading a PDF file, but it can still be read."""
# Specialization of PdfReadError for malformed stream content.
class PdfStreamError(PdfReadError):
    """Raised when there is an issue reading the stream of data in a PDF file."""
class ParseError(PyPdfError):
    """
    Raised when there is an issue parsing a PDF file.

    Parsing means analyzing and understanding the structure and meaning of
    the file's contents.
    """
class FileNotDecryptedError(PdfReadError):
    """
    Raised when an encrypted PDF file has not been successfully decrypted.

    An encrypted file requires a password before its contents can be
    accessed.
    """
# Subclass of FileNotDecryptedError, so callers may catch either.
class WrongPasswordError(FileNotDecryptedError):
    """Raised when the wrong password is used to try to decrypt an encrypted PDF file."""
class EmptyFileError(PdfReadError):
    """Raised when a PDF file is empty or contains no content."""
# Direct subclass of PyPdfError (image handling, not file reading).
class EmptyImageDataError(PyPdfError):
    """Raised when trying to process an image that has no data."""
STREAM_TRUNCATED_PREMATURELY = "Stream has ended unexpectedly"

View File

@ -0,0 +1,836 @@
# Copyright (c) 2006, Mathieu Fenniak
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# * The name of the author may not be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
"""
Implementation of stream filters for PDF.
See TABLE H.1 Abbreviations for standard filter names
"""
__author__ = "Mathieu Fenniak"
__author_email__ = "biziqe@mathieu.fenniak.net"
import math
import struct
import zlib
from base64 import a85decode
from dataclasses import dataclass
from io import BytesIO
from typing import Any, Dict, List, Optional, Tuple, Union, cast
from ._codecs._codecs import LzwCodec as _LzwCodec
from ._utils import (
WHITESPACES_AS_BYTES,
deprecate,
deprecate_with_replacement,
deprecation_no_replacement,
logger_warning,
)
from .constants import CcittFaxDecodeParameters as CCITT
from .constants import FilterTypeAbbreviations as FTA
from .constants import FilterTypes as FT
from .constants import ImageAttributes as IA
from .constants import LzwFilterParameters as LZW
from .constants import StreamAttributes as SA
from .errors import DeprecationError, PdfReadError, PdfStreamError
from .generic import (
ArrayObject,
DictionaryObject,
IndirectObject,
NullObject,
)
def decompress(data: bytes) -> bytes:
    """
    Decompress the given data using zlib.

    Attempts to decompress the input data using zlib.
    If the decompression fails due to a zlib error, it falls back
    to using a decompression object with a larger window size, and as a
    last resort feeds the stream one byte at a time, ignoring errors, to
    salvage as much of a corrupted stream as possible.

    Args:
        data: The input data to be decompressed.

    Returns:
        The decompressed data.
    """
    try:
        return zlib.decompress(data)
    except zlib.error:
        try:
            # For larger files, use decompression object to enable buffered reading
            return zlib.decompressobj().decompress(data)
        except zlib.error:
            # If still failing, then try with increased window size and feed
            # the stream byte by byte, skipping over bytes that raise.
            # Index directly instead of materializing a throwaway list of
            # single-byte slices, and accumulate into a bytearray to avoid
            # quadratic bytes concatenation.
            d = zlib.decompressobj(zlib.MAX_WBITS | 32)
            result = bytearray()
            for i in range(len(data)):
                try:
                    result += d.decompress(data[i : i + 1])
                except zlib.error:
                    pass
            return bytes(result)
class FlateDecode:
    """FlateDecode filter (§7.4.4 of ISO 32000): zlib/deflate with optional
    TIFF or PNG predictors applied after decompression."""

    @staticmethod
    def decode(
        data: bytes,
        decode_parms: Optional[DictionaryObject] = None,
        **kwargs: Any,
    ) -> bytes:
        """
        Decode data which is flate-encoded.

        Args:
            data: flate-encoded data.
            decode_parms: a dictionary of values, understanding the
                "/Predictor":<int> key only

        Returns:
            The flate-decoded data.

        Raises:
            PdfReadError: if an unsupported predictor value is encountered.
        """
        if isinstance(decode_parms, ArrayObject):
            raise DeprecationError("decode_parms as ArrayObject is deprecated")
        str_data = decompress(data)
        predictor = 1
        if decode_parms:
            try:
                predictor = decode_parms.get("/Predictor", 1)
            except (AttributeError, TypeError):  # Type Error is NullObject
                pass  # Usually an array with a null object was read
        # predictor 1 == no predictor
        if predictor != 1:
            # /Columns, the number of samples in each row, has a default value of 1;
            # §7.4.4.3, ISO 32000.
            DEFAULT_BITS_PER_COMPONENT = 8
            # Each of the three parameters falls back to its spec default when
            # missing (KeyError) or when decode_parms is not subscriptable (TypeError).
            try:
                columns = cast(int, decode_parms[LZW.COLUMNS].get_object())  # type: ignore
            except (TypeError, KeyError):
                columns = 1
            try:
                colors = cast(int, decode_parms[LZW.COLORS].get_object())  # type: ignore
            except (TypeError, KeyError):
                colors = 1
            try:
                bits_per_component = cast(
                    int,
                    decode_parms[LZW.BITS_PER_COMPONENT].get_object(),  # type: ignore
                )
            except (TypeError, KeyError):
                bits_per_component = DEFAULT_BITS_PER_COMPONENT
            # PNG predictor can vary by row and so is the lead byte on each row
            rowlength = (
                math.ceil(columns * colors * bits_per_component / 8) + 1
            )  # number of bytes
            # TIFF prediction:
            if predictor == 2:
                rowlength -= 1  # remove the predictor byte
                bpp = rowlength // columns  # bytes per pixel
                str_data = bytearray(str_data)
                for i in range(len(str_data)):
                    if i % rowlength >= bpp:
                        # Horizontal differencing: add the sample bpp bytes back.
                        str_data[i] = (str_data[i] + str_data[i - bpp]) % 256
                str_data = bytes(str_data)
            # PNG prediction:
            elif 10 <= predictor <= 15:
                str_data = FlateDecode._decode_png_prediction(
                    str_data, columns, rowlength
                )
            else:
                raise PdfReadError(f"Unsupported flatedecode predictor {predictor!r}")
        return str_data

    @staticmethod
    def _decode_png_prediction(data: bytes, columns: int, rowlength: int) -> bytes:
        """Reverse the per-row PNG prediction filters (None/Sub/Up/Average/Paeth)."""
        # PNG prediction can vary from row to row
        if len(data) % rowlength != 0:
            raise PdfReadError("Image data is not rectangular")
        output = []
        # The (virtual) row above the first row is all zeros.
        prev_rowdata = (0,) * rowlength
        bpp = (rowlength - 1) // columns  # recomputed locally to not change params
        for row in range(0, len(data), rowlength):
            # rowdata[0] is the per-row filter byte; samples start at index 1.
            rowdata: List[int] = list(data[row : row + rowlength])
            filter_byte = rowdata[0]
            if filter_byte == 0:
                # PNG None Predictor
                pass
            elif filter_byte == 1:
                # PNG Sub Predictor
                for i in range(bpp + 1, rowlength):
                    rowdata[i] = (rowdata[i] + rowdata[i - bpp]) % 256
            elif filter_byte == 2:
                # PNG Up Predictor
                for i in range(1, rowlength):
                    rowdata[i] = (rowdata[i] + prev_rowdata[i]) % 256
            elif filter_byte == 3:
                # PNG Average Predictor
                for i in range(1, bpp + 1):
                    # Left neighbor is implicitly 0 for the first pixel.
                    floor = prev_rowdata[i] // 2
                    rowdata[i] = (rowdata[i] + floor) % 256
                for i in range(bpp + 1, rowlength):
                    left = rowdata[i - bpp]
                    floor = (left + prev_rowdata[i]) // 2
                    rowdata[i] = (rowdata[i] + floor) % 256
            elif filter_byte == 4:
                # PNG Paeth Predictor
                for i in range(1, bpp + 1):
                    # Left and upper-left neighbors are implicitly 0 here.
                    rowdata[i] = (rowdata[i] + prev_rowdata[i]) % 256
                for i in range(bpp + 1, rowlength):
                    left = rowdata[i - bpp]
                    up = prev_rowdata[i]
                    up_left = prev_rowdata[i - bpp]
                    p = left + up - up_left
                    dist_left = abs(p - left)
                    dist_up = abs(p - up)
                    dist_up_left = abs(p - up_left)
                    # Pick the neighbor closest to the initial estimate p;
                    # ties resolve left, then up, then upper-left.
                    if dist_left <= dist_up and dist_left <= dist_up_left:
                        paeth = left
                    elif dist_up <= dist_up_left:
                        paeth = up
                    else:
                        paeth = up_left
                    rowdata[i] = (rowdata[i] + paeth) % 256
            else:
                raise PdfReadError(
                    f"Unsupported PNG filter {filter_byte!r}"
                )  # pragma: no cover
            prev_rowdata = tuple(rowdata)
            # Emit the reconstructed samples, dropping the filter byte.
            output.extend(rowdata[1:])
        return bytes(output)

    @staticmethod
    def encode(data: bytes, level: int = -1) -> bytes:
        """
        Compress the input data using zlib.

        Args:
            data: The data to be compressed.
            level: See https://docs.python.org/3/library/zlib.html#zlib.compress

        Returns:
            The compressed data.
        """
        return zlib.compress(data, level)
class ASCIIHexDecode:
    """
    The ASCIIHexDecode filter turns ASCII-hexadecimal encoded data back
    into the raw bytes it represents.
    """

    @staticmethod
    def decode(
        data: Union[str, bytes],
        decode_parms: Optional["DictionaryObject"] = None,
        **kwargs: Any,
    ) -> bytes:
        """
        Decode an ASCII-Hex encoded data stream.

        Each pair of hexadecimal digits yields one output byte; whitespace
        is ignored and ``>`` marks end of data.

        Args:
            data: hexadecimal-encoded input as ``str`` or ``bytes``.
            decode_parms: ignored for this filter.

        Returns:
            The decoded bytes.

        Raises:
            PdfStreamError:
        """
        # decode_parms is unused here
        if isinstance(data, str):
            data = data.encode()
        decoded = bytearray()
        pending = b""
        position = 0
        total = len(data)
        while position < total:
            char = data[position : position + 1]
            if char == b">":
                break
            if not char.isspace():
                pending += char
                if len(pending) == 2:
                    decoded.append(int(pending, base=16))
                    pending = b""
            position += 1
        else:
            # Input ran out before the ">" end-of-data marker was seen.
            logger_warning(
                "missing EOD in ASCIIHexDecode, check if output is OK", __name__
            )
        # An odd number of hex digits would leave an unconsumed nibble.
        assert pending == b""
        return bytes(decoded)
class RunLengthDecode:
    """
    Decode the simple byte-oriented run-length encoding.

    The encoded stream is a sequence of runs, each a length byte ``L``
    followed by data. ``L`` in 0..127 means the next ``L + 1`` bytes are
    copied literally; ``L`` in 129..255 means the next single byte is
    repeated ``257 - L`` (2 to 128) times; ``L == 128`` denotes EOD.
    """

    @staticmethod
    def decode(
        data: bytes,
        decode_parms: Optional["DictionaryObject"] = None,
        **kwargs: Any,
    ) -> bytes:
        """
        Decode a run length encoded data stream.

        Args:
            data: a bytes sequence of length/data runs.
            decode_parms: ignored.

        Returns:
            The decompressed bytes.

        Raises:
            PdfStreamError: if the EOD marker appears before the data ends.
        """
        # decode_parms is unused here
        chunks = []
        pos = 0
        end = len(data)
        while pos < end:
            run = data[pos]
            pos += 1
            if run == 128:
                # EOD marker must be the very last byte of the stream.
                if pos < end:
                    raise PdfStreamError("Early EOD in RunLengthDecode")
                return b"".join(chunks)
            if run < 128:
                count = run + 1
                chunks.append(data[pos : pos + count])
                pos += count
            else:  # >128
                chunks.append(bytes((data[pos],)) * (257 - run))
                pos += 1
        # Fell off the end of the input without seeing the EOD marker.
        logger_warning(
            "missing EOD in RunLengthDecode, check if output is OK", __name__
        )
        return b"".join(chunks)
class LZWDecode:
    """LZW filter wrapper; the actual decoding is delegated to pypdf's
    internal LzwCodec."""

    class Decoder:
        # Legacy code-point constants; not referenced in this class — the
        # clear/stop codes are handled inside the codec itself.
        STOP = 257
        CLEARDICT = 256

        def __init__(self, data: bytes) -> None:
            self.data = data

        def decode(self) -> bytes:
            # Delegate to the native LZW implementation.
            return _LzwCodec().decode(self.data)

    @staticmethod
    def _decodeb(
        data: bytes,
        decode_parms: Optional[DictionaryObject] = None,
        **kwargs: Any,
    ) -> bytes:
        """
        Decode an LZW encoded data stream.

        Args:
            data: ``bytes`` or ``str`` text to decode.
            decode_parms: a dictionary of parameter values.

        Returns:
            decoded data.
        """
        # decode_parms is unused here
        return LZWDecode.Decoder(data).decode()

    @staticmethod
    def decode(
        data: bytes,
        decode_parms: Optional[DictionaryObject] = None,
        **kwargs: Any,
    ) -> str:  # deprecated
        """
        Decode an LZW encoded data stream.

        Deprecated: will return ``bytes`` instead of ``str`` in pypdf 6.0.0;
        prefer ``_decodeb``.

        Args:
            data: ``bytes`` or ``str`` text to decode.
            decode_parms: a dictionary of parameter values.

        Returns:
            decoded data.
        """
        # decode_parms is unused here
        deprecate("LZWDecode.decode will return bytes instead of str in pypdf 6.0.0")
        return LZWDecode.Decoder(data).decode().decode("latin-1")
class ASCII85Decode:
    """Decodes string ASCII85-encoded data into a byte format."""

    @staticmethod
    def decode(
        data: Union[str, bytes],
        decode_parms: Optional["DictionaryObject"] = None,
        **kwargs: Any,
    ) -> bytes:
        """
        Decode an Ascii85 encoded data stream.

        Args:
            data: ``bytes`` or ``str`` text to decode.
            decode_parms: a dictionary of parameter values (unused here).

        Returns:
            The decoded bytes.
        """
        if isinstance(data, str):
            data = data.encode()
        data = data.strip(WHITESPACES_AS_BYTES)
        try:
            return a85decode(data, adobe=True, ignorechars=WHITESPACES_AS_BYTES)
        except ValueError as exc:
            # Anything other than a missing "~>" terminator is a real error.
            if exc.args[0] != "Ascii85 encoded byte sequences must end with b'~>'":
                raise
        # Tolerate streams without the Adobe end marker.
        logger_warning("Ignoring missing Ascii85 end marker.", __name__)
        return a85decode(data, adobe=False, ignorechars=WHITESPACES_AS_BYTES)
class DCTDecode:
    """Pass-through for DCT (JPEG) encoded data; no transformation is applied."""

    @staticmethod
    def decode(
        data: bytes,
        decode_parms: Optional["DictionaryObject"] = None,
        **kwargs: Any,
    ) -> bytes:
        """Return *data* unchanged; ``decode_parms`` is unused here."""
        return data
class JPXDecode:
    """Pass-through for JPX (JPEG 2000) encoded data; no transformation is applied."""

    @staticmethod
    def decode(
        data: bytes,
        decode_parms: Optional["DictionaryObject"] = None,
        **kwargs: Any,
    ) -> bytes:
        """Return *data* unchanged; ``decode_parms`` is unused here."""
        return data
@dataclass
class CCITTParameters:
    """§7.4.6, optional parameters for the CCITTFaxDecode filter."""

    K: int = 0
    columns: int = 0
    rows: int = 0
    EndOfBlock: Union[int, None] = None
    EndOfLine: Union[int, None] = None
    EncodedByteAlign: Union[int, None] = None
    DamagedRowsBeforeError: Union[int, None] = None

    @property
    def group(self) -> int:
        """CCITT group implied by ``K``: 4 for pure 2-D coding, 3 otherwise."""
        # K < 0  -> pure two-dimensional encoding (Group 4)
        # K == 0 -> pure one-dimensional encoding (Group 3, 1-D)
        # K > 0  -> mixed one- and two-dimensional encoding (Group 3, 2-D)
        return 4 if self.K < 0 else 3
def __create_old_class_instance(
    K: int = 0,
    columns: int = 0,
    rows: int = 0
) -> CCITTParameters:
    """Deprecated factory emulating the misspelled ``CCITParameters`` name;
    emits a deprecation notice and builds a ``CCITTParameters``."""
    deprecate_with_replacement("CCITParameters", "CCITTParameters", "6.0.0")
    return CCITTParameters(K, columns, rows)


# Create an alias for the old class name
CCITParameters = __create_old_class_instance
class CCITTFaxDecode:
    """
    §7.4.6, CCITTFaxDecode filter (ISO 32000).

    Either Group 3 or Group 4 CCITT facsimile (fax) encoding.
    CCITT encoding is bit-oriented, not byte-oriented.

    §7.4.6, optional parameters for the CCITTFaxDecode filter.
    """

    @staticmethod
    def _get_parameters(
        parameters: Union[None, ArrayObject, DictionaryObject, IndirectObject],
        rows: Union[int, IndirectObject],
    ) -> CCITTParameters:
        """Extract /K and /Columns from the decode parameters (array or
        dictionary form), falling back to the spec defaults (0 and 1728)."""
        # §7.4.6, optional parameters for the CCITTFaxDecode filter
        k = 0
        columns = 1728
        if parameters:
            parameters_unwrapped = cast(
                Union[ArrayObject, DictionaryObject], parameters.get_object()
            )
            if isinstance(parameters_unwrapped, ArrayObject):
                # An array of parameter dictionaries; later entries win.
                for decode_parm in parameters_unwrapped:
                    if CCITT.COLUMNS in decode_parm:
                        columns = decode_parm[CCITT.COLUMNS].get_object()
                    if CCITT.K in decode_parm:
                        k = decode_parm[CCITT.K].get_object()
            else:
                if CCITT.COLUMNS in parameters_unwrapped:
                    columns = parameters_unwrapped[CCITT.COLUMNS].get_object()  # type: ignore
                if CCITT.K in parameters_unwrapped:
                    k = parameters_unwrapped[CCITT.K].get_object()  # type: ignore
        return CCITTParameters(K=k, columns=columns, rows=int(rows))

    @staticmethod
    def decode(
        data: bytes,
        decode_parms: Optional[DictionaryObject] = None,
        height: int = 0,
        **kwargs: Any,
    ) -> bytes:
        """Wrap the raw CCITT payload in a minimal little-endian TIFF header
        (8 IFD tags) so image libraries can open it; the payload itself is
        not transformed."""
        # decode_parms is unused here
        if isinstance(decode_parms, ArrayObject):  # deprecated
            deprecation_no_replacement(
                "decode_parms being an ArrayObject", removed_in="3.15.5"
            )
        params = CCITTFaxDecode._get_parameters(decode_parms, height)

        img_size = len(data)
        # Header layout: byte order + magic + IFD offset, then 8 tags of
        # (id, type, count, value), then the next-IFD terminator.
        tiff_header_struct = "<2shlh" + "hhll" * 8 + "h"
        tiff_header = struct.pack(
            tiff_header_struct,
            b"II",  # Byte order indication: Little endian
            42,  # Version number (always 42)
            8,  # Offset to first IFD
            8,  # Number of tags in IFD
            256,
            4,
            1,
            params.columns,  # ImageWidth, LONG, 1, width
            257,
            4,
            1,
            params.rows,  # ImageLength, LONG, 1, length
            258,
            3,
            1,
            1,  # BitsPerSample, SHORT, 1, 1
            259,
            3,
            1,
            params.group,  # Compression, SHORT, 1, 4 = CCITT Group 4 fax encoding
            262,
            3,
            1,
            0,  # Thresholding, SHORT, 1, 0 = WhiteIsZero
            273,
            4,
            1,
            struct.calcsize(
                tiff_header_struct
            ),  # StripOffsets, LONG, 1, length of header
            278,
            4,
            1,
            params.rows,  # RowsPerStrip, LONG, 1, length
            279,
            4,
            1,
            img_size,  # StripByteCounts, LONG, 1, size of image
            0,  # last IFD
        )
        return tiff_header + data
def decode_stream_data(stream: Any) -> bytes:
    """
    Decode the stream data based on the specified filters.

    This function decodes the stream data using the filters provided in the
    stream.

    Args:
        stream: The input stream object containing the data and filters.

    Returns:
        The decoded stream data.

    Raises:
        NotImplementedError: If an unsupported filter type is encountered.
    """
    filters = stream.get(SA.FILTER, ())
    if isinstance(filters, IndirectObject):
        filters = cast(ArrayObject, filters.get_object())
    if not isinstance(filters, ArrayObject):
        # We have a single filter instance
        filters = (filters,)
    # One parameter entry per filter; default to empty dictionaries.
    decode_parms = stream.get(SA.DECODE_PARMS, ({},) * len(filters))
    if not isinstance(decode_parms, (list, tuple)):
        decode_parms = (decode_parms,)
    data: bytes = stream._data
    # If there is not data to decode we should not try to decode the data.
    if not data:
        return data
    # Filters are applied in listed order; each output feeds the next input.
    for filter_name, params in zip(filters, decode_parms):
        if isinstance(params, NullObject):
            params = {}
        if filter_name in (FT.ASCII_HEX_DECODE, FTA.AHx):
            data = ASCIIHexDecode.decode(data)
        elif filter_name in (FT.ASCII_85_DECODE, FTA.A85):
            data = ASCII85Decode.decode(data)
        elif filter_name in (FT.LZW_DECODE, FTA.LZW):
            data = LZWDecode._decodeb(data, params)
        elif filter_name in (FT.FLATE_DECODE, FTA.FL):
            data = FlateDecode.decode(data, params)
        elif filter_name in (FT.RUN_LENGTH_DECODE, FTA.RL):
            data = RunLengthDecode.decode(data)
        elif filter_name == FT.CCITT_FAX_DECODE:
            height = stream.get(IA.HEIGHT, ())
            data = CCITTFaxDecode.decode(data, params, height)
        elif filter_name == FT.DCT_DECODE:
            data = DCTDecode.decode(data)
        elif filter_name == FT.JPX_DECODE:
            data = JPXDecode.decode(data)
        elif filter_name == "/Crypt":
            if "/Name" in params or "/Type" in params:
                raise NotImplementedError(
                    "/Crypt filter with /Name or /Type not supported yet"
                )
            # Otherwise: the default Identity crypt filter — data unchanged.
        else:
            raise NotImplementedError(f"Unsupported filter {filter_name}")
    return data
def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes, Any]:
    """
    Users need to have the pillow package installed.

    It's unclear if pypdf will keep this function here, hence it's private.
    It might get removed at any point.

    Args:
        x_object_obj: the image XObject dictionary to convert.

    Returns:
        Tuple[file extension, bytes, PIL.Image.Image]
    """
    # Imported lazily so pillow stays an optional dependency.
    from ._xobj_image_helpers import (
        Image,
        UnidentifiedImageError,
        _apply_decode,
        _extended_image_frombytes,
        _get_mode_and_invert_color,
        _handle_flate,
        _handle_jpx,
    )

    def _apply_alpha(
        img: Image.Image,
        x_object_obj: Dict[str, Any],
        obj_as_text: str,
        image_format: str,
        extension: str,
    ) -> Tuple[Image.Image, str, str]:
        # Merge the /SMask (soft mask) of the XObject, if any, into the
        # image as an alpha channel, switching to an alpha-capable format.
        alpha = None
        if IA.S_MASK in x_object_obj:  # add alpha channel
            # The mask is itself an image XObject; recurse to decode it.
            alpha = _xobj_to_image(x_object_obj[IA.S_MASK])[2]
            if img.size != alpha.size:
                logger_warning(
                    f"image and mask size not matching: {obj_as_text}", __name__
                )
            else:
                # TODO : implement mask
                if alpha.mode != "L":
                    alpha = alpha.convert("L")
                if img.mode == "P":
                    img = img.convert("RGB")
                elif img.mode == "1":
                    img = img.convert("L")
                img.putalpha(alpha)
            if "JPEG" in image_format:
                extension = ".jp2"
                image_format = "JPEG2000"
            else:
                extension = ".png"
                image_format = "PNG"
        return img, extension, image_format

    # for error reporting
    # NOTE(review): the condition looks inverted — a None object has no
    # .indirect_reference, so the first branch would fail; marked no cover.
    obj_as_text = (
        x_object_obj.indirect_reference.__repr__()
        if x_object_obj is None  # pragma: no cover
        else x_object_obj.__repr__()
    )

    # Get size and data
    size = (cast(int, x_object_obj[IA.WIDTH]), cast(int, x_object_obj[IA.HEIGHT]))
    data = x_object_obj.get_data()  # type: ignore
    if isinstance(data, str):  # pragma: no cover
        data = data.encode()
    # Drop a single trailing newline that would break the expected size.
    if len(data) % (size[0] * size[1]) == 1 and data[-1] == 0x0A:  # ie. '\n'
        data = data[:-1]

    # Get color properties
    colors = x_object_obj.get("/Colors", 1)
    color_space: Any = x_object_obj.get("/ColorSpace", NullObject()).get_object()
    if isinstance(color_space, list) and len(color_space) == 1:
        color_space = color_space[0].get_object()
    mode, invert_color = _get_mode_and_invert_color(x_object_obj, colors, color_space)

    # Get filters — only the last filter decides the output format.
    filters = x_object_obj.get(SA.FILTER, NullObject()).get_object()
    lfilters = filters[-1] if isinstance(filters, list) else filters

    extension = None
    if lfilters in (FT.FLATE_DECODE, FT.RUN_LENGTH_DECODE):
        img, image_format, extension, _ = _handle_flate(
            size,
            data,
            mode,
            color_space,
            colors,
            obj_as_text,
        )
    elif lfilters in (FT.LZW_DECODE, FT.ASCII_85_DECODE, FT.CCITT_FAX_DECODE):
        # I'm not sure if the following logic is correct.
        # There might not be any relationship between the filters and the
        # extension
        if lfilters in (FT.LZW_DECODE, FT.CCITT_FAX_DECODE):
            extension = ".tiff"  # mime_type = "image/tiff"
            image_format = "TIFF"
        else:
            extension = ".png"  # mime_type = "image/png"
            image_format = "PNG"
        try:
            img = Image.open(BytesIO(data), formats=("TIFF", "PNG"))
        except UnidentifiedImageError:
            # Fall back to interpreting the raw bytes directly.
            img = _extended_image_frombytes(mode, size, data)
    elif lfilters == FT.DCT_DECODE:
        img, image_format, extension = Image.open(BytesIO(data)), "JPEG", ".jpg"
        # invert_color kept unchanged
    elif lfilters == FT.JPX_DECODE:
        img, image_format, extension, invert_color = _handle_jpx(
            size, data, mode, color_space, colors
        )
    elif lfilters == FT.CCITT_FAX_DECODE:
        # NOTE(review): this branch appears unreachable — CCITT_FAX_DECODE
        # is already matched by the tuple branch above.
        img, image_format, extension, invert_color = (
            Image.open(BytesIO(data), formats=("TIFF",)),
            "TIFF",
            ".tiff",
            False,
        )
    elif mode == "CMYK":
        img, image_format, extension, invert_color = (
            _extended_image_frombytes(mode, size, data),
            "TIFF",
            ".tif",
            False,
        )
    elif mode == "":
        raise PdfReadError(f"ColorSpace field not found in {x_object_obj}")
    else:
        img, image_format, extension, invert_color = (
            _extended_image_frombytes(mode, size, data),
            "PNG",
            ".png",
            False,
        )
    img = _apply_decode(img, x_object_obj, lfilters, color_space, invert_color)
    img, extension, image_format = _apply_alpha(
        img, x_object_obj, obj_as_text, image_format, extension
    )

    # Save image to bytes
    img_byte_arr = BytesIO()
    try:
        img.save(img_byte_arr, format=image_format)
    except OSError:  # pragma: no cover # covered with pillow 10.3
        # in case of we convert to RGBA and then to PNG
        img1 = img.convert("RGBA")
        image_format = "PNG"
        extension = ".png"
        img_byte_arr = BytesIO()
        img1.save(img_byte_arr, format=image_format)
    data = img_byte_arr.getvalue()

    try:  # temporary try/except until other fixes of images
        img = Image.open(BytesIO(data))
    except Exception:
        img = None  # type: ignore
    return extension, data, img

View File

@ -0,0 +1,238 @@
# Copyright (c) 2006, Mathieu Fenniak
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# * The name of the author may not be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
"""Implementation of generic PDF objects (dictionary, number, string, ...)."""
__author__ = "Mathieu Fenniak"
__author_email__ = "biziqe@mathieu.fenniak.net"
from typing import List, Optional, Tuple, Union
from .._utils import (
deprecation_with_replacement,
)
from ..constants import OutlineFontFlag
from ._base import (
BooleanObject,
ByteStringObject,
FloatObject,
IndirectObject,
NameObject,
NullObject,
NumberObject,
PdfObject,
TextStringObject,
encode_pdfdocencoding,
is_null_or_none,
)
from ._data_structures import (
ArrayObject,
ContentStream,
DecodedStreamObject,
Destination,
DictionaryObject,
EncodedStreamObject,
Field,
StreamObject,
TreeObject,
read_object,
)
from ._files import EmbeddedFile
from ._fit import Fit
from ._outline import OutlineItem
from ._rectangle import RectangleObject
from ._utils import (
create_string_object,
decode_pdfdocencoding,
hex_to_rgb,
read_hex_string_from_stream,
read_string_from_stream,
)
from ._viewerpref import ViewerPreferences
PAGE_FIT = Fit.fit()
class AnnotationBuilder:  # deprecated
    """
    The AnnotationBuilder is deprecated.

    Instead, use the annotation classes in pypdf.annotations.

    See `adding PDF annotations <../user/adding-pdf-annotations.html>`_ for
    its usage combined with PdfWriter.
    """

    # Every method below is a stub that only triggers the deprecation
    # helper, pointing callers at the replacement class.
    from ..generic._rectangle import RectangleObject

    @staticmethod
    def text(
        rect: Union[RectangleObject, Tuple[float, float, float, float]],
        text: str,
        open: bool = False,
        flags: int = 0,
    ) -> None:
        deprecation_with_replacement(
            "AnnotationBuilder.text", "pypdf.annotations.Text", "5.0.0"
        )

    @staticmethod
    def free_text(
        text: str,
        rect: Union[RectangleObject, Tuple[float, float, float, float]],
        font: str = "Helvetica",
        bold: bool = False,
        italic: bool = False,
        font_size: str = "14pt",
        font_color: str = "000000",
        border_color: Optional[str] = "000000",
        background_color: Optional[str] = "ffffff",
    ) -> None:
        deprecation_with_replacement(
            "AnnotationBuilder.free_text", "pypdf.annotations.FreeText", "5.0.0"
        )

    @staticmethod
    def popup(
        *,
        rect: Union[RectangleObject, Tuple[float, float, float, float]],
        flags: int = 0,
        parent: Optional[DictionaryObject] = None,
        open: bool = False,
    ) -> None:
        deprecation_with_replacement(
            "AnnotationBuilder.popup", "pypdf.annotations.Popup", "5.0.0"
        )

    @staticmethod
    def line(
        p1: Tuple[float, float],
        p2: Tuple[float, float],
        rect: Union[RectangleObject, Tuple[float, float, float, float]],
        text: str = "",
        title_bar: Optional[str] = None,
    ) -> None:
        deprecation_with_replacement(
            "AnnotationBuilder.line", "pypdf.annotations.Line", "5.0.0"
        )

    @staticmethod
    def polyline(
        vertices: List[Tuple[float, float]],
    ) -> None:
        deprecation_with_replacement(
            "AnnotationBuilder.polyline", "pypdf.annotations.PolyLine", "5.0.0"
        )

    @staticmethod
    def rectangle(
        rect: Union[RectangleObject, Tuple[float, float, float, float]],
        interiour_color: Optional[str] = None,
    ) -> None:
        deprecation_with_replacement(
            "AnnotationBuilder.rectangle", "pypdf.annotations.Rectangle", "5.0.0"
        )

    @staticmethod
    def highlight(
        *,
        rect: Union[RectangleObject, Tuple[float, float, float, float]],
        quad_points: ArrayObject,
        highlight_color: str = "ff0000",
        printing: bool = False,
    ) -> None:
        deprecation_with_replacement(
            "AnnotationBuilder.highlight", "pypdf.annotations.Highlight", "5.0.0"
        )

    @staticmethod
    def ellipse(
        rect: Union[RectangleObject, Tuple[float, float, float, float]],
        interiour_color: Optional[str] = None,
    ) -> None:
        deprecation_with_replacement(
            "AnnotationBuilder.ellipse", "pypdf.annotations.Ellipse", "5.0.0"
        )

    @staticmethod
    def polygon(vertices: List[Tuple[float, float]]) -> None:
        deprecation_with_replacement(
            "AnnotationBuilder.polygon", "pypdf.annotations.Polygon", "5.0.0"
        )

    from ._fit import DEFAULT_FIT

    @staticmethod
    def link(
        rect: Union[RectangleObject, Tuple[float, float, float, float]],
        border: Optional[ArrayObject] = None,
        url: Optional[str] = None,
        target_page_index: Optional[int] = None,
        fit: Fit = DEFAULT_FIT,
    ) -> None:
        deprecation_with_replacement(
            "AnnotationBuilder.link", "pypdf.annotations.Link", "5.0.0"
        )
# Explicit public API of pypdf.generic.
__all__ = [
    "PAGE_FIT",
    "AnnotationBuilder",
    "ArrayObject",
    "BooleanObject",
    "ByteStringObject",
    "ContentStream",
    "DecodedStreamObject",
    "Destination",
    "DictionaryObject",
    "EmbeddedFile",
    "EncodedStreamObject",
    "Field",
    "Fit",
    "FloatObject",
    "IndirectObject",
    "NameObject",
    "NullObject",
    "NumberObject",
    "OutlineFontFlag",
    "OutlineItem",
    "PdfObject",
    "RectangleObject",
    "StreamObject",
    "TextStringObject",
    "TreeObject",
    "ViewerPreferences",
    # Utility functions
    "create_string_object",
    "decode_pdfdocencoding",
    "encode_pdfdocencoding",
    "hex_to_rgb",
    "is_null_or_none",
    "read_hex_string_from_stream",
    # Data structures core functions
    "read_object",
    "read_string_from_stream",
]

Some files were not shown because too many files have changed in this diff Show More