Fixed ° and the overload ssue
This commit is contained in:
parent
d51d597e87
commit
322d662011
@ -3,3 +3,4 @@ pdfs/
|
||||
node_modules/
|
||||
pycache/
|
||||
cloned_repo/
|
||||
extracted_texts/
|
||||
@ -2,7 +2,7 @@ Control Panel,Unit Number,Alias,Equipment Type,Type of Conveyor,Speed,Drive Hand
|
||||
BULK INBOUND NORTH,BS1-005,PS10-1,Powered-Belted,Level Belt,150,RH,460/3/60,25,5,Y,N,RPH3200BXB-FR,NA,24,12,N,Y,NA,NA,"36"" SG, Bi-directional"
|
||||
BULK INBOUND NORTH,BS1-010,PS10-2,Powered-Belted,Incline Belt,200,LH,460/3/60,25,15,Y,N,APH150MFOXLN,NA,24,,N,Y,NA,NA,"36"" SG"
|
||||
BULK INBOUND NORTH,BS1-011,PS10-3,Powered-Belted,Incline Belt,240,RH,460/3/60,25,15,Y,N,APH150MFOXLN,NA,24.00'',12'',,,,,
|
||||
BULK INBOUND NORTH,BS1-015-CH,PS10-4CH,90<EFBFBD> Spiral Chute,Chute,NA,NA,NA,25,NA,NA,NA,NA,NA,24.00'',24.00'',NA,NA,NA,NA,
|
||||
BULK INBOUND NORTH,BS1-015-CH,PS10-4CH,90° Spiral Chute,Chute,NA,NA,NA,25,NA,NA,NA,NA,NA,24.00'',24.00'',NA,NA,NA,NA,
|
||||
BULK INBOUND NORTH,BS1-020,PS10-5,Powered-Belted,Level Belt,240,LH,460/3/60,25,15,Y,N,RPH3200BXB-FR,NA,24.00'',12'',N,Y,Y,QTY 4,
|
||||
BULK INBOUND NORTH,BS1-020-CH1,PS10-5CH1,Induct Chute,Chute,NA,NA,NA,25,NA,NA,NA,NA,NA,24.00'',24.00'',NA,NA,NA,NA,
|
||||
BULK INBOUND NORTH,BS1-020-CH2,PS10-5CH2,Induct Chute,Chute,NA,NA,NA,25,NA,NA,NA,NA,NA,24.00'',24.00'',NA,NA,NA,NA,
|
||||
@ -12,17 +12,17 @@ BULK INBOUND NORTH,BS1-020-DIV1,PS10-5DIV1,Sorter,Divert Arm,NA,NA,NA,NA,NA,NA,N
|
||||
BULK INBOUND NORTH,BS1-020-DIV2,PS10-5DIV2,Sorter,Divert Arm,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,
|
||||
BULK INBOUND NORTH,BS1-020-DIV3,PS10-5DIV3,Sorter,Divert Arm,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,
|
||||
BULK INBOUND NORTH,BS1-020-DIV4,PS10-5DIV5,Sorter,Divert Arm,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,
|
||||
BULK INBOUND NORTH,BS1-025-CH,PS10-6CH,90<EFBFBD> Straight Chute,Chute,NA,NA,NA,25,NA,NA,NA,NA,NA,24.00'',24.00'',NA,NA,NA,NA,
|
||||
BULK INBOUND NORTH,BS1-025-CH,PS10-6CH,90° Straight Chute,Chute,NA,NA,NA,25,NA,NA,NA,NA,NA,24.00'',24.00'',NA,NA,NA,NA,
|
||||
BULK INBOUND NORTH,BS2-005,PS11-1,Powered-Belted,Level Belt,150,RH,460/3/60,25,5,Y,N,RPH3200BXB-FR,NA,24.00'',12'',N,Y,NA,NA,"36"" SG"
|
||||
BULK INBOUND NORTH,BS2-010,PS11-2,Powered-Belted,Incline Belt,200,RH,460/3/60,25,5,Y,N,APH150MFOXLN,NA,24.00'',12'',Y,Y,NA,NA,"36"" SG"
|
||||
BULK INBOUND NORTH,BS2-015,PS11-3,Powered-Belted,Level Belt,240,RH,460/3/60,25,10,Y,N,RPH3200BXB-FR,NA,24.00'',12'',Y,Y,NA,NA,
|
||||
BULK INBOUND NORTH,BS2-020,PS11-4,Powered-Belted,Incline Belt,240,LH,460/3/60,25,10,Y,N,APH150MFOXLN,NA,24.00'',-,N,Y,NA,NA,
|
||||
BULK INBOUND NORTH,BS2-025-CH,PS11-5CH,90<EFBFBD> Spiral Chute,Chute,NA,NA,NA,25,NA,NA,NA,NA,NA,24.00'',24.00'',NA,NA,NA,NA,
|
||||
BULK INBOUND NORTH,BS2-025-CH,PS11-5CH,90° Spiral Chute,Chute,NA,NA,NA,25,NA,NA,NA,NA,NA,24.00'',24.00'',NA,NA,NA,NA,
|
||||
BULK INBOUND NORTH,BS2-030,PS11-6,Powered-Belted,Incline Belt,240,RH,460/3/60,25,10,Y,N,APH150MFOXLN,NA,24.00'',-,N,Y,NA,NA,"36"" SG"
|
||||
BULK INBOUND NORTH,BS2-035,PS11-7,Powered-Belted,Incline Belt,240,RH,460/3/60,25,20,Y,N,APH150MFOXLN,NA,24.00'',-,N,Y,NA,NA,"36"" SG"
|
||||
BULK INBOUND NORTH,BS2-040,PS11-8,Powered-Belted,Incline Belt,240,RH,460/3/60,25,15,Y,N,APH150MFOXLN,NA,24.00'',-,N,Y,NA,NA,"36"" SG"
|
||||
BULK INBOUND NORTH,BS2-045,PS11-9,Powered-Belted,Incline Belt,240,RH,460/3/60,25,15,Y,N,APH150MFOXLN,NA,24.00'',12'',N,Y,NA,NA,"36"" SG"
|
||||
BULK INBOUND NORTH,BS2-050-CH,PS11-10CH,90<EFBFBD> Spiral Chute,Chute,NA,NA,NA,25,NA,NA,NA,NA,NA,24.00'',24.00'',NA,NA,NA,NA,
|
||||
BULK INBOUND NORTH,BS2-050-CH,PS11-10CH,90° Spiral Chute,Chute,NA,NA,NA,25,NA,NA,NA,NA,NA,24.00'',24.00'',NA,NA,NA,NA,
|
||||
BULK INBOUND NORTH,BS2-055,PS11-11,Powered-Belted,Level Belt,240,LH,460/3/60,25,15,Y,N,RPH3200BXB-FR,NA,24.00'',12'',N,Y,Y,QTY 6,
|
||||
BULK INBOUND NORTH,BS2-055-CH1,PS11-11CH1,Induct Chute,Chute,NA,NA,NA,25,NA,NA,NA,NA,NA,24.00'',24.00'',NA,NA,NA,NA,
|
||||
BULK INBOUND NORTH,BS2-055-CH2,PS11-11CH2,Induct Chute,Chute,NA,NA,NA,25,NA,NA,NA,NA,NA,24.00'',24.00'',NA,NA,NA,NA,
|
||||
@ -36,17 +36,17 @@ BULK INBOUND NORTH,BS2-055-DIV3,PS11-11DIV3,Sorter,Divert Arm,NA,NA,NA,NA,NA,NA,
|
||||
BULK INBOUND NORTH,BS2-055-DIV4,PS11-11DIV4,Sorter,Divert Arm,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,
|
||||
BULK INBOUND NORTH,BS2-055-DIV5,PS11-11DIV5,Sorter,Divert Arm,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,
|
||||
BULK INBOUND NORTH,BS2-055-DIV6,PS11-11DIV6,Sorter,Divert Arm,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,
|
||||
BULK INBOUND NORTH,BS2-060-CH,PS1-12CH,90<EFBFBD> Straight Chute,Chute,NA,NA,NA,25,NA,NA,NA,NA,NA,24.00'',24.00'',NA,NA,NA,NA,
|
||||
BULK INBOUND NORTH,BS2-060-CH,PS1-12CH,90° Straight Chute,Chute,NA,NA,NA,25,NA,NA,NA,NA,NA,24.00'',24.00'',NA,NA,NA,NA,
|
||||
BULK INBOUND SOUTH,BS3-005,PS8-1,Powered-Belted,Level Belt,150,LH,460/3/60,25,7.5,Y,N,RPH3200BXB-FR,NA,24.00'',12'',N,Y,NA,NA,
|
||||
BULK INBOUND SOUTH,BS3-010,PS8-2,Powered-Belted,Incline Belt,200,LH,460/3/60,25,10,Y,N,RPH3200BXB-FR,NA,24.00'',12'',Y,Y,NA,NA,
|
||||
BULK INBOUND SOUTH,BS3-015-CH,PS8-3CH,90<EFBFBD> Spiral Chute,Chute,NA,NA,NA,25,NA,NA,NA,NA,NA,24.00'',24.00'',NA,NA,NA,NA,
|
||||
BULK INBOUND SOUTH,BS3-015-CH,PS8-3CH,90° Spiral Chute,Chute,NA,NA,NA,25,NA,NA,NA,NA,NA,24.00'',24.00'',NA,NA,NA,NA,
|
||||
BULK INBOUND SOUTH,BS3-020,PS8-4,Powered-Belted,Incline Belt,240,LH,460/3/60,25,15,Y,N,APH150MFOXLN,NA,24.00'',-,N,Y,NA,NA,"36"" SG"
|
||||
BULK INBOUND SOUTH,BS3-025,PS8-5,Powered-Belted,Incline Belt,240,LH,460/3/60,25,15,Y,N,APH150MFOXLN,NA,24.00'',12'',N,Y,NA,NA,"36"" SG"
|
||||
BULK INBOUND SOUTH,BS3-030,PS8-6,Sorter,Intralox Flowsplitter,240,LH,460/3/60,25,5,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,
|
||||
BULK INBOUND SOUTH,BS3-035-CH,PS8-7CH,Straight Chute,Chute,NA,NA,NA,25,NA,NA,NA,NA,NA,24.00'',24.00'',NA,NA,NA,NA,
|
||||
BULK INBOUND SOUTH,BS3-040,PS8-8,Powered-Belted,Incline Belt,240,RH,460/3/60,25,7.5,Y,N,APH150MFOXLN,NA,24.00'',-,N,Y,NA,NA,
|
||||
BULK INBOUND SOUTH,BS3-045,PS8-9,Powered-Belted,Incline Belt,240,LH,460/3/60,25,15,Y,N,APH150MFOXLN,NA,24.00'',-,N,Y,NA,NA,
|
||||
BULK INBOUND SOUTH,BS3-050-CH,PS8-10CH,90<EFBFBD> Spiral Chute,Chute,NA,NA,NA,25,NA,NA,NA,NA,NA,24.00'',24.00'',NA,NA,NA,NA,
|
||||
BULK INBOUND SOUTH,BS3-050-CH,PS8-10CH,90° Spiral Chute,Chute,NA,NA,NA,25,NA,NA,NA,NA,NA,24.00'',24.00'',NA,NA,NA,NA,
|
||||
BULK INBOUND SOUTH,BS3-055,PS8-11,Powered-Belted,Level Belt,240,RH,460/3/60,25,10,Y,N,RPH3200BXB-FR,NA,24.00'',12'',N,Y,Y,QTY 4,
|
||||
BULK INBOUND SOUTH,BS3-055-CH1,PS8-11CH1,Induct Chute,Chute,NA,NA,NA,25,NA,NA,NA,NA,NA,24.00'',24.00'',NA,NA,NA,NA,
|
||||
BULK INBOUND SOUTH,BS3-055-CH2,PS8-11CH2,Induct Chute,Chute,NA,NA,NA,25,NA,NA,NA,NA,NA,24.00'',24.00'',NA,NA,NA,NA,
|
||||
@ -56,7 +56,7 @@ BULK INBOUND SOUTH,BS3-055-DIV1,PS8-11DIV1,Sorter,Divert Arm,NA,NA,NA,NA,NA,NA,N
|
||||
BULK INBOUND SOUTH,BS3-055-DIV2,PS8-11DIV2,Sorter,Divert Arm,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,
|
||||
BULK INBOUND SOUTH,BS3-055-DIV3,PS8-11DIV3,Sorter,Divert Arm,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,
|
||||
BULK INBOUND SOUTH,BS3-055-DIV4,PS8-11DIV4,Sorter,Divert Arm,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,
|
||||
BULK INBOUND SOUTH,BS3-060-CH,PS8-12CH,90<EFBFBD> Straight Chute,Chute,NA,NA,NA,25,NA,NA,NA,NA,NA,24.00'',24.00'',NA,NA,NA,NA,
|
||||
BULK INBOUND SOUTH,BS3-060-CH,PS8-12CH,90° Straight Chute,Chute,NA,NA,NA,25,NA,NA,NA,NA,NA,24.00'',24.00'',NA,NA,NA,NA,
|
||||
BULK INBOUND SOUTH,BS4-005-CH,PS9-1CH,Straight Chute,Chute,NA,NA,NA,25,NA,NA,NA,NA,NA,24.00'',24.00'',NA,NA,NA,NA,
|
||||
BULK INBOUND SOUTH,BS4-010,PS9-2,Powered-Belted,Decline Belt,240,RH,460/3/60,25,7.5,Y,N,APH150MFOXLN,NA,24.00'',-,N,Y,NA,NA,
|
||||
BULK INBOUND SOUTH,BS4-015,PS9-3,Powered-Belted,Level Belt,240,RH,460/3/60,25,7.5,Y,N,RPH3200BXB-FR,NA,24.00'',12'',N,Y,Y,QTY 4,
|
||||
|
||||
|
130
app.py
130
app.py
@ -54,7 +54,8 @@ def get_views_dir_path():
|
||||
def get_text_output_dir_path():
|
||||
# Construct absolute path based on the script's directory
|
||||
script_dir = os.path.dirname(os.path.abspath(__file__))
|
||||
return os.path.join(script_dir, TEXT_OUTPUT_FOLDER)
|
||||
# Use os.path.join to handle path separators correctly and avoid './'
|
||||
return os.path.abspath(os.path.join(script_dir, TEXT_OUTPUT_FOLDER))
|
||||
|
||||
def normalize(text):
|
||||
"""Normalize string for comparison: lowercase, treat '-' and '_' the same, remove all whitespace."""
|
||||
@ -68,12 +69,14 @@ def normalize(text):
|
||||
def read_manifest(csv_filepath):
|
||||
"""Reads the manifest CSV into a list of dictionaries."""
|
||||
manifest_items = []
|
||||
# Only require Alias and Panel now for basic grouping
|
||||
required_cols = {CSV_ALIAS_COL, CSV_PANEL_COL}
|
||||
optional_cols = {CSV_EQ_TYPE_COL, CSV_CONV_TYPE_COL}
|
||||
try:
|
||||
with open(csv_filepath, mode='r', newline='', encoding='utf-8') as infile:
|
||||
# Revert back to 'utf-8-sig' to handle potential BOM from Excel
|
||||
with open(csv_filepath, mode='r', newline='', encoding='utf-8-sig') as infile:
|
||||
reader = csv.DictReader(infile)
|
||||
headers = set(h.strip() for h in reader.fieldnames) # Handle potential whitespace in headers
|
||||
headers = set(h.strip() for h in reader.fieldnames)
|
||||
|
||||
# Check for required columns
|
||||
missing_required = required_cols - headers
|
||||
@ -85,12 +88,16 @@ def read_manifest(csv_filepath):
|
||||
for row in reader:
|
||||
alias = row.get(CSV_ALIAS_COL, "").strip()
|
||||
panel = row.get(CSV_PANEL_COL, "").strip()
|
||||
if alias and panel: # Only add if Alias and Control Panel are present
|
||||
# unit_number = row.get('Unit Number', "").strip() # No longer needed for filename
|
||||
|
||||
# Add if Alias and Control Panel are present (Panel needed for grouping results later)
|
||||
if alias and panel:
|
||||
item = {
|
||||
"alias": alias,
|
||||
"normalized_alias": normalize(alias),
|
||||
"control_panel": panel,
|
||||
"expected_drawing_filename": f"{panel}.txt", # Assuming .txt file matches panel name
|
||||
# "unit_number": unit_number, # Removed
|
||||
# "expected_drawing_filename": f"MTN6_SYSDL-{unit_number}.txt", # Removed
|
||||
# Add optional data if columns exist
|
||||
"equipment_type": row.get(CSV_EQ_TYPE_COL, "").strip() if CSV_EQ_TYPE_COL in headers else "N/A",
|
||||
"conveyor_type": row.get(CSV_CONV_TYPE_COL, "").strip() if CSV_CONV_TYPE_COL in headers else "N/A",
|
||||
@ -99,8 +106,11 @@ def read_manifest(csv_filepath):
|
||||
"found_drawing": False
|
||||
}
|
||||
manifest_items.append(item)
|
||||
# elif alias and panel: # If Unit Number is missing but others are present # Condition removed
|
||||
# print(f"Warning: Alias '{alias}' in Panel '{panel}' is missing 'Unit Number' in CSV. Skipping drawing check for this item.")
|
||||
elif alias and not panel:
|
||||
print(f"Warning: Alias '{alias}' found in CSV but is missing its '{CSV_PANEL_COL}'. Skipping.")
|
||||
# Add other specific warnings if needed
|
||||
|
||||
except FileNotFoundError:
|
||||
print(f"Error: Manifest file not found at {csv_filepath}")
|
||||
@ -154,44 +164,50 @@ def check_scada(manifest_data, views_dir):
|
||||
|
||||
|
||||
def check_drawings(manifest_data, text_output_dir):
|
||||
"""Checks for aliases in extracted drawing text files, one file per panel."""
|
||||
"""Checks if aliases from manifest exist in *any* extracted drawing text file."""
|
||||
if not manifest_data: return
|
||||
print(f"Starting Drawings check in directory: {text_output_dir}...")
|
||||
found_count = 0
|
||||
file_cache = {} # Cache normalized content of processed text files
|
||||
print(f"Starting Drawings check: Scanning all .txt files in directory: {text_output_dir}...")
|
||||
|
||||
for item in manifest_data:
|
||||
normalized_alias = item['normalized_alias']
|
||||
txt_filename = item['expected_drawing_filename']
|
||||
txt_filepath = os.path.join(text_output_dir, txt_filename)
|
||||
all_normalized_content = "" # Combine all text content here
|
||||
processed_files = 0
|
||||
found_files = []
|
||||
|
||||
try:
|
||||
# Check cache first
|
||||
if txt_filepath in file_cache:
|
||||
normalized_content = file_cache[txt_filepath]
|
||||
# Read and cache if not already processed
|
||||
elif os.path.exists(txt_filepath):
|
||||
with open(txt_filepath, 'r', encoding='utf-8') as f:
|
||||
content = f.read()
|
||||
normalized_content = normalize(content)
|
||||
file_cache[txt_filepath] = normalized_content # Cache it
|
||||
else:
|
||||
# File doesn't exist, mark as not found in cache to avoid re-checking
|
||||
file_cache[txt_filepath] = None
|
||||
# print(f" Info: Expected drawing text file not found: {txt_filepath}")
|
||||
continue # Cannot find alias if file doesn't exist
|
||||
try:
|
||||
# Step 1: Read and combine content of all .txt files in the directory
|
||||
for filename in os.listdir(text_output_dir):
|
||||
if filename.lower().endswith('.txt'):
|
||||
filepath = os.path.join(text_output_dir, filename)
|
||||
processed_files += 1
|
||||
try:
|
||||
with open(filepath, 'r', encoding='utf-8') as f:
|
||||
content = f.read()
|
||||
# Add a separator to prevent false matches across file boundaries
|
||||
all_normalized_content += normalize(content) + "\n--file-separator--\n"
|
||||
found_files.append(filename)
|
||||
except Exception as e:
|
||||
print(f" Warning: Could not read or process text file {filepath}: {e}")
|
||||
|
||||
# Perform check if file content exists
|
||||
if normalized_content is not None and normalized_alias in normalized_content:
|
||||
if not item['found_drawing']: # Avoid double counting if alias appears multiple times in manifest
|
||||
item['found_drawing'] = True
|
||||
found_count += 1
|
||||
if processed_files == 0:
|
||||
print(" Warning: No .txt files found in the directory. Cannot perform drawing check.")
|
||||
return
|
||||
else:
|
||||
print(f" Successfully read and normalized content from {len(found_files)} out of {processed_files} .txt files found.")
|
||||
|
||||
except Exception as e:
|
||||
print(f" Warning: Could not read or process text file {txt_filepath}: {e}")
|
||||
file_cache[txt_filepath] = None # Mark as failed in cache
|
||||
# Step 2: Check each manifest alias against the combined content
|
||||
found_count = 0
|
||||
for item in manifest_data:
|
||||
normalized_alias = item['normalized_alias']
|
||||
if normalized_alias and normalized_alias in all_normalized_content:
|
||||
item['found_drawing'] = True
|
||||
found_count += 1
|
||||
# else: item['found_drawing'] is already False by default
|
||||
|
||||
print(f"Drawings check finished. Processed {len(file_cache)} unique text files. Found {found_count} manifest aliases.")
|
||||
print(f"Drawings check finished. Found {found_count} manifest aliases within the combined text content.")
|
||||
|
||||
except FileNotFoundError:
|
||||
print(f" Error: Drawings text directory not found: {text_output_dir}")
|
||||
except Exception as e:
|
||||
print(f" Error during drawings check: {e}")
|
||||
|
||||
|
||||
def calculate_combined_progress(manifest_data):
|
||||
@ -323,10 +339,9 @@ def update_progress_data():
|
||||
status_message = current_status # Update status regardless of calculation success/failure
|
||||
if new_data_calculated is not None:
|
||||
progress_data = new_data_calculated
|
||||
|
||||
# Signal that an update attempt finished
|
||||
data_updated_event.set()
|
||||
data_updated_event.clear()
|
||||
# Signal that an update attempt finished WITH new data
|
||||
data_updated_event.set()
|
||||
data_updated_event.clear()
|
||||
|
||||
# --- Git Repo Handling (Modified slightly to use updated status messages) ---
|
||||
|
||||
@ -342,8 +357,6 @@ def check_and_update_repo():
|
||||
if not repo_existed:
|
||||
print(f"Cloning repository {REPO_URL} into {repo_path}...")
|
||||
status_message = f"Cloning repository {REPO_URL}..."
|
||||
# Signal status change during long operation
|
||||
data_updated_event.set(); data_updated_event.clear()
|
||||
git.Repo.clone_from(REPO_URL, repo_path, branch=BRANCH)
|
||||
repo = git.Repo(repo_path)
|
||||
last_commit_hash = repo.head.commit.hexsha
|
||||
@ -368,8 +381,6 @@ def check_and_update_repo():
|
||||
if current_local_commit != current_remote_commit:
|
||||
print("New commit detected! Pulling changes...")
|
||||
status_message = "Pulling updates..."
|
||||
# Signal status change during potentially long operation
|
||||
data_updated_event.set(); data_updated_event.clear()
|
||||
try:
|
||||
pull_info = origin.pull()
|
||||
new_commit_hash = repo.head.commit.hexsha
|
||||
@ -392,9 +403,10 @@ def check_and_update_repo():
|
||||
# Status will be updated within update_progress_data
|
||||
update_progress_data()
|
||||
# If no git update, signal any status change (e.g., "No changes" or error)
|
||||
else:
|
||||
data_updated_event.set() # Signal status change event
|
||||
data_updated_event.clear()
|
||||
# else: # REMOVED block that signaled event for no changes
|
||||
# REMOVED: data_updated_event.set() # Signal status change event
|
||||
# REMOVED: data_updated_event.clear()
|
||||
# Status message is still updated globally, just won't trigger event
|
||||
|
||||
except git.GitCommandError as e:
|
||||
status_message = f"Git command error: {e}"
|
||||
@ -407,14 +419,14 @@ def check_and_update_repo():
|
||||
if last_commit_hash is None: last_commit_hash = repo.head.commit.hexsha
|
||||
except Exception:
|
||||
if last_commit_hash is None: last_commit_hash = "Error reading commit"
|
||||
data_updated_event.set() # Signal error status change
|
||||
data_updated_event.clear()
|
||||
# REMOVED: data_updated_event.set() # Signal error status change
|
||||
# REMOVED: data_updated_event.clear()
|
||||
except Exception as e:
|
||||
status_message = f"Error checking repository: {e}"
|
||||
print(status_message)
|
||||
if last_commit_hash is None: last_commit_hash = "Error checking repo"
|
||||
data_updated_event.set() # Signal error status change
|
||||
data_updated_event.clear()
|
||||
# REMOVED: data_updated_event.set() # Signal error status change
|
||||
# REMOVED: data_updated_event.clear()
|
||||
|
||||
# Return true if analysis was run (because repo changed), false otherwise
|
||||
return did_update
|
||||
@ -439,6 +451,16 @@ def periodic_repo_check():
|
||||
def index():
|
||||
return render_template('index.html')
|
||||
|
||||
@app.route('/drawings')
|
||||
def drawings_page():
|
||||
# Render the main index template which now contains all content
|
||||
return render_template('index.html')
|
||||
|
||||
@app.route('/conflicts')
|
||||
def conflicts_page():
|
||||
# Render the main index template which now contains all content
|
||||
return render_template('index.html')
|
||||
|
||||
@app.route('/stream')
|
||||
def stream():
|
||||
def event_stream():
|
||||
@ -479,11 +501,11 @@ def stream():
|
||||
})
|
||||
yield f"data: {data_payload}\n\n"
|
||||
last_sent_hash_to_client = current_global_hash # Update the hash sent to this client
|
||||
else:
|
||||
# else: # No need for the else block logging here anymore, as the event shouldn't trigger if hash is same
|
||||
# If hash is the same, maybe only the status message changed (e.g., error occurred)
|
||||
# Option: Send update only if status is different from last sent status?
|
||||
# For simplicity now, we only send if hash differs. Client UI shows last known status.
|
||||
print(f"Data updated event triggered, but hash {current_global_hash} unchanged for this client. Status: '{current_global_status}'")
|
||||
# print(f"Data updated event triggered, but hash {current_global_hash} unchanged for this client. Status: '{current_global_status}'") # Removed log
|
||||
|
||||
|
||||
return Response(event_stream(), mimetype="text/event-stream")
|
||||
|
||||
@ -1 +1 @@
|
||||
Subproject commit c8aa36809970e0557f46ee80b7f7cf3735efb487
|
||||
Subproject commit 456de12cca56c09bc1881660b163ac3b5dff593a
|
||||
@ -3,7 +3,7 @@
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>Ignition SCADA & Drawing Progress Monitor</title>
|
||||
<title>SCADA Progress Monitor</title>
|
||||
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/css/bootstrap.min.css" rel="stylesheet">
|
||||
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
|
||||
<style>
|
||||
@ -37,7 +37,7 @@
|
||||
margin: 0 auto; /* Center the canvas */
|
||||
cursor: pointer; /* Indicate clickable */
|
||||
}
|
||||
#panels-progress {
|
||||
#scada-panels-progress, #drawing-panels-progress {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(auto-fill, minmax(200px, 1fr)); /* Responsive grid */
|
||||
gap: 20px;
|
||||
@ -47,25 +47,66 @@
|
||||
.modal-body th { background-color: #f8f9fa; text-align: left; }
|
||||
.status-yes { color: green; font-weight: bold; }
|
||||
.status-no { color: red; font-weight: bold; }
|
||||
nav { margin-bottom: 20px; } /* Added for nav spacing */
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<div class="container">
|
||||
<h1 class="mb-4">SCADA & Drawing Device Placement Progress</h1>
|
||||
<!-- Added Navigation -->
|
||||
<nav class="nav nav-pills">
|
||||
<a class="nav-link active" aria-current="page" href="/">SCADA Progress</a>
|
||||
<a class="nav-link" href="/drawings">Drawing Progress</a>
|
||||
<a class="nav-link" href="/conflicts">Conflicts</a>
|
||||
</nav>
|
||||
|
||||
<div id="overall-progress" class="chart-container">
|
||||
<span class="chart-label">Overall Progress</span>
|
||||
<canvas id="overall-chart-canvas" class="panel-chart-canvas" style="max-width: 200px; max-height: 200px;"></canvas>
|
||||
<div id="overall-text" style="font-weight: bold; margin-top: 10px;">Found Both: 0/0 (0%)</div>
|
||||
<!-- SCADA Content Section -->
|
||||
<div id="scada-content">
|
||||
<h1 class="mb-4">SCADA Device Placement Progress</h1>
|
||||
<p>Compares the Equipment Manifest against the SCADA view.json files.</p>
|
||||
|
||||
<div id="overall-scada-progress" class="chart-container">
|
||||
<span class="chart-label">Overall SCADA Progress</span>
|
||||
<canvas id="overall-scada-chart-canvas" class="panel-chart-canvas" style="max-width: 200px; max-height: 200px;"></canvas>
|
||||
<div id="overall-scada-text" style="font-weight: bold; margin-top: 10px;">Found in SCADA: 0/0 (0%)</div>
|
||||
</div>
|
||||
|
||||
<hr>
|
||||
|
||||
<h2>SCADA Progress by Control Panel</h2>
|
||||
<div id="scada-panels-progress">
|
||||
<p>Loading panel data...</p>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<hr>
|
||||
<!-- Drawing Content Section (Initially Hidden) -->
|
||||
<div id="drawings-content" style="display: none;">
|
||||
<h1 class="mb-4">Drawing Device Placement Progress</h1>
|
||||
<p>Compares the Equipment Manifest against the extracted text from drawing files (.txt).</p>
|
||||
|
||||
<h2>Progress by Control Panel</h2>
|
||||
<div id="panels-progress">
|
||||
<!-- Charts will be loaded here -->
|
||||
<p>Loading panel data...</p>
|
||||
<div id="overall-drawing-progress" class="chart-container">
|
||||
<span class="chart-label">Overall Drawing Progress</span>
|
||||
<canvas id="overall-drawing-chart-canvas" class="panel-chart-canvas" style="max-width: 200px; max-height: 200px;"></canvas>
|
||||
<div id="overall-drawing-text" style="font-weight: bold; margin-top: 10px;">Found in Drawing: 0/0 (0%)</div>
|
||||
</div>
|
||||
|
||||
<hr>
|
||||
|
||||
<h2>Drawing Progress by Control Panel</h2>
|
||||
<div id="drawing-panels-progress">
|
||||
<p>Loading panel data...</p>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Conflicts Content Section (Initially Hidden) -->
|
||||
<div id="conflicts-content" style="display: none;">
|
||||
<h1 class="mb-4">SCADA/Drawing Conflicts <span id="conflict-count" class="badge bg-warning ms-2">0</span></h1>
|
||||
<p>Items found in SCADA views but <strong>not</strong> found in the extracted drawing text files.</p>
|
||||
|
||||
<div id="panels-conflicts">
|
||||
<p>Loading conflict data...</p>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
||||
<!-- Status Bar -->
|
||||
@ -89,7 +130,6 @@
|
||||
<th>Panel</th>
|
||||
<th>SCADA Status</th>
|
||||
<th>Drawing Status</th>
|
||||
<th>Expected Drawing File</th>
|
||||
<th>Equipment Type</th>
|
||||
<th>Type of Conveyor</th>
|
||||
</tr>
|
||||
@ -108,84 +148,332 @@
|
||||
|
||||
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/js/bootstrap.bundle.min.js"></script>
|
||||
<script>
|
||||
let chartInstances = {};
|
||||
let progressDetailsData = {};
|
||||
// --- Global State Variables ---
|
||||
let chartInstancesScada = {}; // Separate instances for SCADA
|
||||
let chartInstancesDrawing = {}; // Separate instances for Drawing
|
||||
let progressDetailsData = {}; // Stores the raw data from SSE (shared)
|
||||
let previousCommitHash = null; // Single hash for the whole page
|
||||
let detailsModalInstance = null;
|
||||
let currentVisibleSection = 'scada'; // Track visible section: 'scada', 'drawing', 'conflicts'
|
||||
|
||||
// Define labels and colors consistently
|
||||
const chartLabels = ['Found Both', 'SCADA Only', 'Drawing Only', 'Missing Both'];
|
||||
const chartColors = [
|
||||
'rgb(25, 135, 84)', // Green (Found Both)
|
||||
'rgb(13, 202, 240)', // Cyan (SCADA Only)
|
||||
'rgb(255, 193, 7)', // Yellow (Drawing Only)
|
||||
'rgb(220, 53, 69)' // Red (Missing Both)
|
||||
];
|
||||
const listKeys = ['found_both_list', 'found_scada_only_list', 'found_drawing_only_list', 'missing_list'];
|
||||
// --- Chart Configurations ---
|
||||
const scadaChartLabels = ['Found in SCADA', 'Not Found in SCADA'];
|
||||
const scadaChartColors = ['rgb(13, 110, 253)', 'rgb(220, 53, 69)'];
|
||||
const drawingChartLabels = ['Found in Drawing', 'Not Found in Drawing'];
|
||||
const drawingChartColors = ['rgb(25, 135, 84)', 'rgb(220, 53, 69)'];
|
||||
|
||||
// --- Chart Click Handler (Updated) ---
|
||||
function handleChartClick(event, elements, chart) {
|
||||
// Map backend list keys for modal clicks (can be combined or kept separate if needed)
|
||||
const scadaListKeysMap = {
|
||||
found: ['found_both_list', 'found_scada_only_list'],
|
||||
notFound: ['found_drawing_only_list', 'missing_list']
|
||||
};
|
||||
const drawingListKeysMap = {
|
||||
found: ['found_both_list', 'found_drawing_only_list'],
|
||||
notFound: ['found_scada_only_list', 'missing_list']
|
||||
};
|
||||
|
||||
// --- Debounce Utility (Only need one) ---
|
||||
function debounce(func, wait) {
|
||||
let timeout;
|
||||
return function executedFunction(...args) {
|
||||
const later = () => {
|
||||
clearTimeout(timeout);
|
||||
func(...args);
|
||||
};
|
||||
clearTimeout(timeout);
|
||||
timeout = setTimeout(later, wait);
|
||||
};
|
||||
}
|
||||
|
||||
// --- Chart Click Handler (Needs context: SCADA or Drawing?) ---
|
||||
function handleChartClick(event, elements, chart, context) { // Added context
|
||||
if (elements.length > 0) {
|
||||
const clickedElementIndex = elements[0].index;
|
||||
const isOverallChart = chart.canvas.id === 'overall-chart-canvas';
|
||||
const identifier = isOverallChart ? '__overall__' : chart.canvas.id.replace('chart-', '');
|
||||
const isOverallChart = chart.canvas.id.startsWith('overall-'); // More robust check
|
||||
const identifier = isOverallChart ? '__overall__' : chart.canvas.id.replace(`chart-${context}-`, ''); // Use context
|
||||
const categoryType = clickedElementIndex === 0 ? 'found' : 'notFound';
|
||||
|
||||
// Map clicked index to the correct list type/key
|
||||
if (clickedElementIndex >= 0 && clickedElementIndex < listKeys.length) {
|
||||
const listType = listKeys[clickedElementIndex];
|
||||
showDetailsModal(identifier, listType);
|
||||
} else {
|
||||
console.warn("Clicked unknown chart segment index:", clickedElementIndex);
|
||||
}
|
||||
showDetailsModal(identifier, categoryType, context); // Pass context to modal
|
||||
}
|
||||
}
|
||||
|
||||
// --- UI Update Function (Heavily Updated) ---
|
||||
function updateUI(data) {
|
||||
console.log("Updating UI with data:", data);
|
||||
progressDetailsData = data.progress;
|
||||
// --- Core UI Update Functions (One for each section) ---
|
||||
|
||||
// Update status bar
|
||||
document.getElementById('status-message').textContent = data.status;
|
||||
document.getElementById('last-commit').textContent = data.last_commit || 'N/A';
|
||||
function updateUIScadaCore(data) {
|
||||
console.log("Running core SCADA UI redraw logic for commit:", data.last_commit);
|
||||
progressDetailsData = data.progress; // Update shared raw data
|
||||
|
||||
// --- Update Overall Chart & Text ---
|
||||
// --- Overall SCADA Chart ---
|
||||
const overallData = progressDetailsData.overall;
|
||||
const overallTotal = overallData.total_csv;
|
||||
const overallChartCounts = [
|
||||
overallData.found_both,
|
||||
overallData.found_scada_only,
|
||||
overallData.found_drawing_only,
|
||||
overallData.missing_both
|
||||
];
|
||||
// Update text (showing found both %)
|
||||
document.getElementById('overall-text').textContent = `Found Both: ${overallData.found_both}/${overallTotal} (${overallData.percentage_found_both}%)`;
|
||||
const overallFoundScada = overallData.found_both + overallData.found_scada_only;
|
||||
const overallNotFoundScada = overallData.found_drawing_only + overallData.missing_both;
|
||||
const overallPercentageFound = overallTotal > 0 ? ((overallFoundScada / overallTotal) * 100).toFixed(1) : 0;
|
||||
const overallChartCounts = [overallFoundScada, overallNotFoundScada];
|
||||
|
||||
const overallChartConfig = {
|
||||
document.getElementById('overall-scada-text').textContent = `Found in SCADA: ${overallFoundScada}/${overallTotal} (${overallPercentageFound}%)`;
|
||||
|
||||
// --- Only update/create chart if section is visible ---
|
||||
const isSectionVisible = (currentVisibleSection === 'scada');
|
||||
if (isSectionVisible) {
|
||||
const overallScadaCanvas = document.getElementById('overall-scada-chart-canvas');
|
||||
if (chartInstancesScada['overall']) {
|
||||
if (JSON.stringify(chartInstancesScada['overall'].data.datasets[0].data) !== JSON.stringify(overallChartCounts)) {
|
||||
chartInstancesScada['overall'].data.datasets[0].data = overallChartCounts;
|
||||
chartInstancesScada['overall'].update('none');
|
||||
}
|
||||
} else if (overallScadaCanvas) {
|
||||
console.log("Creating overall SCADA chart (visible).");
|
||||
const ctxOverall = overallScadaCanvas.getContext('2d');
|
||||
chartInstancesScada['overall'] = new Chart(ctxOverall, createChartConfig(overallChartCounts, overallTotal, 'scada', 'overall'));
|
||||
}
|
||||
} else {
|
||||
// If section is not visible, destroy the chart instance if it exists
|
||||
if (chartInstancesScada['overall']) {
|
||||
console.log("Destroying hidden overall SCADA chart.");
|
||||
chartInstancesScada['overall'].destroy();
|
||||
delete chartInstancesScada['overall'];
|
||||
}
|
||||
}
|
||||
|
||||
// --- SCADA Panel Charts ---
|
||||
const panelsContainer = document.getElementById('scada-panels-progress');
|
||||
const panelsData = progressDetailsData.panels || {};
|
||||
updatePanelCharts(panelsContainer, panelsData, chartInstancesScada, 'scada');
|
||||
|
||||
console.log("Finished SCADA UI core redraw.");
|
||||
}
|
||||
|
||||
function updateUIDrawingCore(data) {
|
||||
console.log("Running core Drawing UI redraw logic for commit:", data.last_commit);
|
||||
progressDetailsData = data.progress; // Update shared raw data
|
||||
|
||||
// --- Overall Drawing Chart ---
|
||||
const overallData = progressDetailsData.overall;
|
||||
const overallTotal = overallData.total_csv;
|
||||
const overallFoundDrawing = overallData.found_both + overallData.found_drawing_only;
|
||||
const overallNotFoundDrawing = overallData.found_scada_only + overallData.missing_both;
|
||||
const overallPercentageFound = overallTotal > 0 ? ((overallFoundDrawing / overallTotal) * 100).toFixed(1) : 0;
|
||||
const overallChartCounts = [overallFoundDrawing, overallNotFoundDrawing];
|
||||
|
||||
document.getElementById('overall-drawing-text').textContent = `Found in Drawing: ${overallFoundDrawing}/${overallTotal} (${overallPercentageFound}%)`;
|
||||
|
||||
// --- Only update/create chart if section is visible ---
|
||||
const isSectionVisible = (currentVisibleSection === 'drawings');
|
||||
if (isSectionVisible) {
|
||||
const overallDrawingCanvas = document.getElementById('overall-drawing-chart-canvas');
|
||||
if (chartInstancesDrawing['overall']) {
|
||||
if (JSON.stringify(chartInstancesDrawing['overall'].data.datasets[0].data) !== JSON.stringify(overallChartCounts)) {
|
||||
chartInstancesDrawing['overall'].data.datasets[0].data = overallChartCounts;
|
||||
chartInstancesDrawing['overall'].update('none');
|
||||
}
|
||||
} else if (overallDrawingCanvas) {
|
||||
console.log("Creating overall drawing chart (visible).");
|
||||
const ctxOverall = overallDrawingCanvas.getContext('2d');
|
||||
chartInstancesDrawing['overall'] = new Chart(ctxOverall, createChartConfig(overallChartCounts, overallTotal, 'drawing', 'overall'));
|
||||
}
|
||||
} else {
|
||||
// If section is not visible, destroy the chart instance if it exists
|
||||
if (chartInstancesDrawing['overall']) {
|
||||
console.log("Destroying hidden overall Drawing chart.");
|
||||
chartInstancesDrawing['overall'].destroy();
|
||||
delete chartInstancesDrawing['overall'];
|
||||
}
|
||||
}
|
||||
|
||||
// --- Drawing Panel Charts (call updatePanelCharts, which also checks visibility/destroys) ---
|
||||
const panelsContainer = document.getElementById('drawing-panels-progress');
|
||||
const panelsData = progressDetailsData.panels || {};
|
||||
console.log(`[updateUIDrawingCore] Found drawing panels container:`, panelsContainer ? panelsContainer.id : 'Not Found'); // Added Log
|
||||
updatePanelCharts(panelsContainer, panelsData, chartInstancesDrawing, 'drawings'); // Changed context to plural 'drawings'
|
||||
|
||||
console.log("Finished Drawing UI core redraw.");
|
||||
}
|
||||
|
||||
function updateUIConflictsCore(data) {
|
||||
console.log("Running core Conflicts UI redraw logic for commit:", data.last_commit);
|
||||
progressDetailsData = data.progress; // Update shared raw data
|
||||
|
||||
const panelsContainer = document.getElementById('panels-conflicts');
|
||||
panelsContainer.innerHTML = ''; // Clear previous
|
||||
|
||||
const panelsData = progressDetailsData.panels;
|
||||
let totalConflicts = 0;
|
||||
let panelsWithConflicts = 0;
|
||||
|
||||
if (!panelsData || Object.keys(panelsData).length === 0) {
|
||||
panelsContainer.innerHTML = '<p class="text-center fst-italic">No panel data available yet.</p>';
|
||||
} else {
|
||||
const sortedPanels = Object.keys(panelsData).sort();
|
||||
sortedPanels.forEach(panelName => {
|
||||
const panel = panelsData[panelName];
|
||||
const conflictsList = panel.found_scada_only_list || [];
|
||||
if (conflictsList.length > 0) {
|
||||
panelsWithConflicts++;
|
||||
totalConflicts += conflictsList.length;
|
||||
// ... (Create header and table as in conflicts.html) ...
|
||||
const panelHeader = document.createElement('h4');
|
||||
panelHeader.className = 'mt-4 mb-2';
|
||||
panelHeader.textContent = `${panelName} (${conflictsList.length} conflicts)`;
|
||||
panelsContainer.appendChild(panelHeader);
|
||||
|
||||
const table = document.createElement('table');
|
||||
table.className = 'table table-sm table-striped table-hover table-bordered';
|
||||
const thead = table.createTHead();
|
||||
thead.innerHTML = `<tr><th>Alias</th><th>Panel</th><th>SCADA Status</th><th>Drawing Status</th><th>Equipment Type</th><th>Type of Conveyor</th></tr>`;
|
||||
const tbody = table.createTBody();
|
||||
conflictsList.sort((a, b) => a.alias.localeCompare(b.alias)).forEach(item => {
|
||||
const row = tbody.insertRow();
|
||||
row.classList.add('table-warning');
|
||||
row.insertCell().textContent = item.alias;
|
||||
row.insertCell().textContent = item.control_panel;
|
||||
row.insertCell().innerHTML = '<span class="status-yes">Yes</span>';
|
||||
row.insertCell().innerHTML = '<span class="status-no">No</span>';
|
||||
row.insertCell().textContent = item.equipment_type || 'N/A';
|
||||
row.insertCell().textContent = item.conveyor_type || 'N/A';
|
||||
});
|
||||
panelsContainer.appendChild(table);
|
||||
}
|
||||
});
|
||||
if (panelsWithConflicts === 0) {
|
||||
panelsContainer.innerHTML = '<p class="text-center fst-italic">No conflicts found across all panels.</p>';
|
||||
}
|
||||
}
|
||||
// Update total count badge
|
||||
const countBadge = document.getElementById('conflict-count');
|
||||
if (countBadge) {
|
||||
countBadge.textContent = totalConflicts;
|
||||
countBadge.style.display = totalConflicts > 0 ? 'inline-block' : 'none';
|
||||
}
|
||||
console.log("Finished Conflicts UI core redraw.");
|
||||
}
|
||||
|
||||
// --- Generic Panel Chart Update Logic ---
|
||||
function updatePanelCharts(panelsContainer, panelsData, chartInstances, context) { // context: 'scada' or 'drawing'
|
||||
const incomingPanelNames = new Set(Object.keys(panelsData).sort());
|
||||
const existingInstanceNames = new Set(Object.keys(chartInstances).filter(k => k !== 'overall'));
|
||||
|
||||
// --- Check if the context matches the currently visible section ---
|
||||
const isSectionVisible = (context === currentVisibleSection);
|
||||
if (!isSectionVisible) {
|
||||
// If section is not visible, destroy existing panel chart instances for this context
|
||||
console.log(`Destroying hidden panel charts for context: ${context}`);
|
||||
existingInstanceNames.forEach(panelName => {
|
||||
if (chartInstances[panelName]) {
|
||||
chartInstances[panelName].destroy();
|
||||
delete chartInstances[panelName];
|
||||
}
|
||||
});
|
||||
// Don't proceed further if the section is hidden
|
||||
return;
|
||||
}
|
||||
|
||||
if (incomingPanelNames.size > 0) {
|
||||
const loadingMsg = panelsContainer.querySelector('p');
|
||||
if (loadingMsg) { loadingMsg.remove(); }
|
||||
|
||||
incomingPanelNames.forEach(panelName => {
|
||||
const panel = panelsData[panelName];
|
||||
const panelTotal = panel.total;
|
||||
let panelChartCounts;
|
||||
if (context === 'scada') {
|
||||
panelChartCounts = [panel.found_both + panel.found_scada_only, panel.found_drawing_only + panel.missing_both];
|
||||
} else { // drawing
|
||||
panelChartCounts = [panel.found_both + panel.found_drawing_only, panel.found_scada_only + panel.missing_both];
|
||||
}
|
||||
|
||||
// --- Only update/create chart if section is visible ---
|
||||
if (isSectionVisible) {
|
||||
if (chartInstances[panelName]) {
|
||||
if (JSON.stringify(chartInstances[panelName].data.datasets[0].data) !== JSON.stringify(panelChartCounts)) {
|
||||
chartInstances[panelName].data.datasets[0].data = panelChartCounts;
|
||||
chartInstances[panelName].update('none');
|
||||
}
|
||||
} else {
|
||||
let canvas = document.getElementById(`chart-${context}-${panelName}`); // Use context in ID
|
||||
if (canvas) {
|
||||
console.log(`Recreating ${context} chart instance for panel (visible): ${panelName}`);
|
||||
const ctx = canvas.getContext('2d');
|
||||
chartInstances[panelName] = new Chart(ctx, createChartConfig(panelChartCounts, panelTotal, context, panelName));
|
||||
} else {
|
||||
console.log(`Creating new ${context} panel elements and chart (visible) for: ${panelName}`);
|
||||
const chartContainer = document.createElement('div');
|
||||
chartContainer.id = `chart-container-${context}-${panelName}`; // Use context in ID
|
||||
chartContainer.className = 'chart-container';
|
||||
const label = document.createElement('span');
|
||||
label.className = 'chart-label'; label.textContent = panelName;
|
||||
canvas = document.createElement('canvas'); // Reassign canvas variable
|
||||
canvas.id = `chart-${context}-${panelName}`; // Use context in ID
|
||||
canvas.className = 'panel-chart-canvas';
|
||||
chartContainer.appendChild(label);
|
||||
chartContainer.appendChild(canvas);
|
||||
// Added Log before append
|
||||
console.log(`[updatePanelCharts] Appending chartContainer (${chartContainer.id}) to panelsContainer (${panelsContainer ? panelsContainer.id : 'null'})`);
|
||||
panelsContainer.appendChild(chartContainer); // Append to the main panels progress div
|
||||
const ctx = canvas.getContext('2d');
|
||||
chartInstances[panelName] = new Chart(ctx, createChartConfig(panelChartCounts, panelTotal, context, panelName));
|
||||
}
|
||||
}
|
||||
}
|
||||
// --- End visibility check ---
|
||||
});
|
||||
} else {
|
||||
if (!panelsContainer.querySelector('p')) {
|
||||
panelsContainer.innerHTML = '<p class="text-center fst-italic">No panel data available yet.</p>';
|
||||
}
|
||||
}
|
||||
|
||||
existingInstanceNames.forEach(panelName => {
|
||||
if (!incomingPanelNames.has(panelName)) {
|
||||
console.log(`Removing ${context} panel elements and chart for: ${panelName}`);
|
||||
// Ensure chart is destroyed before removing element
|
||||
if (chartInstances[panelName]) {
|
||||
chartInstances[panelName].destroy();
|
||||
delete chartInstances[panelName];
|
||||
}
|
||||
const chartElement = document.getElementById(`chart-container-${context}-${panelName}`); // Use context
|
||||
if (chartElement) {
|
||||
chartElement.remove();
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// --- Generic Helper to create chart config --- Needs context ---
|
||||
function createChartConfig(chartCounts, total, context, identifier) { // identifier is 'overall' or panelName
|
||||
const labels = context === 'scada' ? scadaChartLabels : drawingChartLabels;
|
||||
const colors = context === 'scada' ? scadaChartColors : drawingChartColors;
|
||||
const datasetLabel = context === 'scada' ? 'SCADA Match' : 'Drawing Match';
|
||||
|
||||
return {
|
||||
type: 'pie',
|
||||
data: {
|
||||
labels: chartLabels,
|
||||
labels: labels,
|
||||
datasets: [{
|
||||
label: 'Overall Aliases',
|
||||
data: overallChartCounts,
|
||||
backgroundColor: chartColors,
|
||||
label: datasetLabel,
|
||||
data: chartCounts,
|
||||
backgroundColor: colors,
|
||||
hoverOffset: 4
|
||||
}]
|
||||
},
|
||||
options: {
|
||||
responsive: true,
|
||||
maintainAspectRatio: false,
|
||||
onClick: handleChartClick,
|
||||
onClick: (event, elements, chart) => handleChartClick(event, elements, chart, context), // Pass context
|
||||
plugins: {
|
||||
legend: { display: false },
|
||||
tooltip: {
|
||||
callbacks: {
|
||||
label: function(context) {
|
||||
let label = context.label || '';
|
||||
label: function(ctxTooltip) {
|
||||
let label = ctxTooltip.label || '';
|
||||
if (label) label += ': ';
|
||||
const value = context.parsed;
|
||||
const value = ctxTooltip.parsed;
|
||||
if (value !== null) label += value;
|
||||
if (overallTotal > 0) {
|
||||
label += ` (${((value / overallTotal) * 100).toFixed(1)}%)`;
|
||||
// Use overallTotal for overall chart, panelTotal otherwise (How to get panelTotal here? Needs rethinking)
|
||||
// Workaround: Don't show percentage on panel tooltips for now
|
||||
const chartTotal = (identifier === 'overall' && progressDetailsData.overall) ? progressDetailsData.overall.total_csv : null;
|
||||
if (chartTotal && chartTotal > 0) {
|
||||
label += ` (${((value / chartTotal) * 100).toFixed(1)}%)`;
|
||||
}
|
||||
return label;
|
||||
}
|
||||
@ -194,192 +482,168 @@
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
const overallCanvas = document.getElementById('overall-chart-canvas');
|
||||
if (chartInstances['overall']) {
|
||||
chartInstances['overall'].data = overallChartConfig.data;
|
||||
chartInstances['overall'].update();
|
||||
} else if (overallCanvas) {
|
||||
const ctxOverall = overallCanvas.getContext('2d');
|
||||
chartInstances['overall'] = new Chart(ctxOverall, overallChartConfig);
|
||||
}
|
||||
|
||||
// --- Update Panel Charts ---
|
||||
const panelsContainer = document.getElementById('panels-progress');
|
||||
const panelsData = progressDetailsData.panels;
|
||||
const sortedPanels = Object.keys(panelsData).sort();
|
||||
const currentPanelsOnPage = new Set(Object.keys(chartInstances).filter(k => k !== 'overall'));
|
||||
const incomingPanels = new Set(sortedPanels);
|
||||
|
||||
// Remove charts for panels no longer present
|
||||
currentPanelsOnPage.forEach(panelName => {
|
||||
if (!incomingPanels.has(panelName)) {
|
||||
if(chartInstances[panelName]) { chartInstances[panelName].destroy(); delete chartInstances[panelName]; }
|
||||
const chartElement = document.getElementById(`chart-container-${panelName}`);
|
||||
if (chartElement) chartElement.remove();
|
||||
}
|
||||
});
|
||||
|
||||
// Update or create charts for current panels
|
||||
if (sortedPanels.length === 0) {
|
||||
panelsContainer.innerHTML = '<p>No panel data available yet.</p>';
|
||||
} else {
|
||||
// Remove loading message if it exists
|
||||
const loadingMsg = panelsContainer.querySelector('p');
|
||||
if (loadingMsg && loadingMsg.textContent.includes('Loading')) { loadingMsg.remove(); }
|
||||
|
||||
sortedPanels.forEach(panelName => {
|
||||
const panel = panelsData[panelName];
|
||||
const panelTotal = panel.total;
|
||||
const panelChartCounts = [
|
||||
panel.found_both,
|
||||
panel.found_scada_only,
|
||||
panel.found_drawing_only,
|
||||
panel.missing_both
|
||||
];
|
||||
|
||||
let chartContainer = document.getElementById(`chart-container-${panelName}`);
|
||||
let canvas = document.getElementById(`chart-${panelName}`);
|
||||
|
||||
// Create container and canvas if they don't exist
|
||||
if (!chartContainer) {
|
||||
chartContainer = document.createElement('div');
|
||||
chartContainer.id = `chart-container-${panelName}`;
|
||||
chartContainer.className = 'chart-container';
|
||||
const label = document.createElement('span');
|
||||
label.className = 'chart-label'; label.textContent = panelName;
|
||||
canvas = document.createElement('canvas');
|
||||
canvas.id = `chart-${panelName}`;
|
||||
canvas.className = 'panel-chart-canvas';
|
||||
chartContainer.appendChild(label);
|
||||
chartContainer.appendChild(canvas);
|
||||
panelsContainer.appendChild(chartContainer);
|
||||
}
|
||||
|
||||
const panelChartConfig = {
|
||||
type: 'pie',
|
||||
data: {
|
||||
labels: chartLabels,
|
||||
datasets: [{
|
||||
label: 'Aliases',
|
||||
data: panelChartCounts,
|
||||
backgroundColor: chartColors,
|
||||
hoverOffset: 4
|
||||
}]
|
||||
},
|
||||
options: {
|
||||
responsive: true,
|
||||
maintainAspectRatio: false,
|
||||
onClick: handleChartClick,
|
||||
plugins: {
|
||||
legend: { display: false },
|
||||
tooltip: {
|
||||
callbacks: {
|
||||
label: function(context) {
|
||||
let label = context.label || '';
|
||||
if (label) label += ': ';
|
||||
const value = context.parsed;
|
||||
if (value !== null) label += value;
|
||||
if (panelTotal > 0) {
|
||||
label += ` (${((value / panelTotal) * 100).toFixed(1)}%)`;
|
||||
}
|
||||
return label;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Update existing chart or create new one
|
||||
if (chartInstances[panelName]) {
|
||||
chartInstances[panelName].data = panelChartConfig.data;
|
||||
chartInstances[panelName].update();
|
||||
} else if (canvas) {
|
||||
const ctx = canvas.getContext('2d');
|
||||
chartInstances[panelName] = new Chart(ctx, panelChartConfig);
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// --- Modal Display Function (Heavily Updated) ---
|
||||
function showDetailsModal(identifier, listKey) {
|
||||
// --- Wrapper function called by debouncer (Handles all sections) ---
|
||||
function processUpdate(data) {
|
||||
console.log("Processing update for commit:", data.last_commit);
|
||||
|
||||
// Always update status bar and commit hash text immediately
|
||||
document.getElementById('status-message').textContent = data.status;
|
||||
document.getElementById('last-commit').textContent = data.last_commit || 'N/A';
|
||||
|
||||
// *** Strict Check: Only proceed if commit hash has changed ***
|
||||
if (data.last_commit && data.last_commit !== previousCommitHash) {
|
||||
console.log("Commit hash changed (" + (previousCommitHash || 'None') + " -> " + data.last_commit + ") or initial load. Queueing core redraw.");
|
||||
previousCommitHash = data.last_commit;
|
||||
// Defer the core UI update calls
|
||||
setTimeout(() => {
|
||||
// Update all sections - they have internal checks/efficiency
|
||||
updateUIScadaCore(data);
|
||||
updateUIDrawingCore(data);
|
||||
updateUIConflictsCore(data);
|
||||
}, 0);
|
||||
} else {
|
||||
console.log("Commit hash unchanged (" + previousCommitHash + "), skipping core UI redraw.");
|
||||
}
|
||||
}
|
||||
|
||||
// --- Debounced version of the processing function ---
|
||||
const debouncedProcessUpdate = debounce(processUpdate, 250); // Single debouncer
|
||||
|
||||
// --- Modal Display Function (Needs context) ---
|
||||
function showDetailsModal(identifier, categoryType, context) { // Added context
|
||||
let sourceData = null;
|
||||
let panelNameDisplay = ""; // Name to show in the title
|
||||
const listTypeLabel = chartLabels[listKeys.indexOf(listKey)] || "Details"; // Get nice label
|
||||
let panelNameDisplay = "";
|
||||
const listKeysMap = context === 'scada' ? scadaListKeysMap : drawingListKeysMap;
|
||||
const listTypeLabel = categoryType === 'found'
|
||||
? (context === 'scada' ? 'Found in SCADA' : 'Found in Drawing')
|
||||
: (context === 'scada' ? 'Not Found in SCADA' : 'Not Found in Drawing');
|
||||
|
||||
if (identifier === '__overall__') {
|
||||
sourceData = progressDetailsData.overall;
|
||||
panelNameDisplay = "Overall";
|
||||
} else {
|
||||
sourceData = progressDetailsData.panels[identifier];
|
||||
panelNameDisplay = identifier; // Use panel name from identifier
|
||||
sourceData = progressDetailsData.panels ? progressDetailsData.panels[identifier] : null;
|
||||
panelNameDisplay = identifier;
|
||||
}
|
||||
|
||||
if (!sourceData || !sourceData[listKey]) {
|
||||
console.error("Data list not found for:", identifier, listKey);
|
||||
alert(`Could not find data for ${listTypeLabel} in ${panelNameDisplay}.`);
|
||||
return;
|
||||
}
|
||||
if (!sourceData) { /* ... error handling ... */ return; }
|
||||
|
||||
const dataList = sourceData[listKey];
|
||||
const backendListKeys = listKeysMap[categoryType];
|
||||
if (!backendListKeys) { /* ... error handling ... */ return; }
|
||||
|
||||
if (!dataList || dataList.length === 0) {
|
||||
console.log(`No items to show for:`, panelNameDisplay, listKey);
|
||||
alert(`No ${listTypeLabel} items found for ${panelNameDisplay}.`);
|
||||
return;
|
||||
}
|
||||
let combinedDataList = [];
|
||||
backendListKeys.forEach(key => {
|
||||
if (sourceData[key]) {
|
||||
combinedDataList = combinedDataList.concat(sourceData[key]);
|
||||
}
|
||||
});
|
||||
|
||||
if (combinedDataList.length === 0) { /* ... alert handling ... */ return; }
|
||||
|
||||
const modalTitleElement = document.getElementById('detailsModalLabel');
|
||||
const modalTableBody = document.querySelector('#detailsModal .modal-body tbody');
|
||||
|
||||
// Update modal title dynamically
|
||||
modalTitleElement.innerHTML = `${listTypeLabel} Items for ${panelNameDisplay} <span class="badge bg-secondary ms-2">${dataList.length}</span>`;
|
||||
modalTitleElement.innerHTML = `${listTypeLabel} Items for ${panelNameDisplay} <span class="badge bg-secondary ms-2">${combinedDataList.length}</span>`;
|
||||
modalTableBody.innerHTML = '';
|
||||
|
||||
modalTableBody.innerHTML = ''; // Clear previous entries
|
||||
|
||||
// Populate table rows with detailed info
|
||||
dataList.forEach(item => {
|
||||
combinedDataList.sort((a, b) => a.alias.localeCompare(b.alias)).forEach(item => {
|
||||
const row = document.createElement('tr');
|
||||
|
||||
row.insertCell().textContent = item.alias;
|
||||
row.insertCell().textContent = item.control_panel;
|
||||
|
||||
// SCADA Status Cell
|
||||
const scadaCell = row.insertCell();
|
||||
scadaCell.innerHTML = item.found_scada
|
||||
? '<span class="status-yes">Yes</span>'
|
||||
: '<span class="status-no">No</span>';
|
||||
|
||||
// Drawing Status Cell
|
||||
const drawingCell = row.insertCell();
|
||||
drawingCell.innerHTML = item.found_drawing
|
||||
? '<span class="status-yes">Yes</span>'
|
||||
: '<span class="status-no">No</span>';
|
||||
|
||||
row.insertCell().textContent = item.expected_drawing_filename || 'N/A';
|
||||
const scadaCell = row.insertCell(); scadaCell.innerHTML = item.found_scada ? '<span class="status-yes">Yes</span>' : '<span class="status-no">No</span>';
|
||||
const drawingCell = row.insertCell(); drawingCell.innerHTML = item.found_drawing ? '<span class="status-yes">Yes</span>' : '<span class="status-no">No</span>';
|
||||
row.insertCell().textContent = item.equipment_type || 'N/A';
|
||||
row.insertCell().textContent = item.conveyor_type || 'N/A';
|
||||
|
||||
if (item.found_scada && !item.found_drawing) { row.classList.add('table-warning'); }
|
||||
modalTableBody.appendChild(row);
|
||||
});
|
||||
|
||||
// Initialize and show modal
|
||||
if (!detailsModalInstance) {
|
||||
detailsModalInstance = new bootstrap.Modal(document.getElementById('detailsModal'));
|
||||
}
|
||||
detailsModalInstance.show();
|
||||
}
|
||||
|
||||
// --- Connect to SSE stream (Unchanged) ---
|
||||
// --- Navigation Handling ---
|
||||
function showSection(sectionId) {
|
||||
console.log("Showing section:", sectionId);
|
||||
document.getElementById('scada-content').style.display = 'none';
|
||||
document.getElementById('drawings-content').style.display = 'none';
|
||||
document.getElementById('conflicts-content').style.display = 'none';
|
||||
|
||||
const elementToShow = document.getElementById(`${sectionId}-content`);
|
||||
if (elementToShow) {
|
||||
elementToShow.style.display = 'block';
|
||||
currentVisibleSection = sectionId;
|
||||
|
||||
// --- Trigger update for the now-visible section ---
|
||||
// The update function will check visibility internally before drawing charts.
|
||||
if (progressDetailsData && Object.keys(progressDetailsData).length > 0) {
|
||||
const updateData = { progress: progressDetailsData }; // Pass existing data
|
||||
console.log(`Calling update function for now-visible section: ${sectionId}`);
|
||||
// Use setTimeout to ensure DOM update (display: block) is processed first
|
||||
if (sectionId === 'scada') {
|
||||
updateUIScadaCore(updateData);
|
||||
} else if (sectionId === 'drawings') {
|
||||
updateUIDrawingCore(updateData);
|
||||
} else if (sectionId === 'conflicts') {
|
||||
updateUIConflictsCore(updateData);
|
||||
}
|
||||
} else {
|
||||
console.log(`Section ${sectionId} shown, but no progress data yet.`);
|
||||
// If data arrives later, the debouncedProcessUpdate will handle drawing
|
||||
// for the currently visible section.
|
||||
}
|
||||
// --- End section update trigger ---
|
||||
|
||||
} else {
|
||||
console.error("Attempted to show unknown section:", sectionId);
|
||||
document.getElementById('scada-content').style.display = 'block'; // Default back to SCADA
|
||||
currentVisibleSection = 'scada';
|
||||
}
|
||||
|
||||
// Update active nav link
|
||||
document.querySelectorAll('.nav-link').forEach(link => {
|
||||
link.classList.remove('active');
|
||||
// Use href attribute to match sectionId
|
||||
const targetSection = link.getAttribute('data-target-section');
|
||||
if (targetSection === sectionId) {
|
||||
link.classList.add('active');
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
document.addEventListener('DOMContentLoaded', () => {
|
||||
console.log("DOM Loaded, setting up navigation...");
|
||||
document.querySelectorAll('.nav-link').forEach(link => {
|
||||
// Store target section ID in a data attribute from href
|
||||
const href = link.getAttribute('href');
|
||||
let targetSection = 'scada'; // Default
|
||||
if (href === '/drawings') targetSection = 'drawings'; // Use plural to match ID
|
||||
else if (href === '/conflicts') targetSection = 'conflicts'; // Use plural to match ID
|
||||
link.setAttribute('data-target-section', targetSection);
|
||||
|
||||
link.addEventListener('click', (event) => {
|
||||
event.preventDefault(); // Prevent page reload
|
||||
const sectionId = link.getAttribute('data-target-section');
|
||||
showSection(sectionId);
|
||||
});
|
||||
});
|
||||
|
||||
// Show initial section (SCADA by default)
|
||||
showSection('scada');
|
||||
});
|
||||
|
||||
// --- Connect to SSE stream (Single connection) ---
|
||||
console.log("Initializing SSE connection...");
|
||||
const eventSource = new EventSource("/stream");
|
||||
|
||||
eventSource.onmessage = function(event) {
|
||||
console.log("SSE message received:", event.data);
|
||||
try {
|
||||
const data = JSON.parse(event.data);
|
||||
updateUI(data); // Call the UI update function with the new data
|
||||
debouncedProcessUpdate(data); // Call the single debounced processor
|
||||
} catch (error) {
|
||||
console.error("Error parsing SSE data:", error);
|
||||
document.getElementById('status-message').textContent = 'Error processing update from server.';
|
||||
@ -389,10 +653,9 @@
|
||||
eventSource.onerror = function(err) {
|
||||
console.error("EventSource failed:", err);
|
||||
document.getElementById('status-message').textContent = 'Connection to server lost. Retrying...';
|
||||
// Note: browser usually attempts reconnection automatically
|
||||
};
|
||||
|
||||
// No need for initial fetch here, SSE stream sends initial state on connect
|
||||
console.log("SSE handler set up.");
|
||||
|
||||
</script>
|
||||
</body>
|
||||
|
||||
@ -0,0 +1 @@
|
||||
pip
|
||||
173
venv/lib/python3.12/site-packages/pypdf-5.4.0.dist-info/METADATA
Normal file
173
venv/lib/python3.12/site-packages/pypdf-5.4.0.dist-info/METADATA
Normal file
@ -0,0 +1,173 @@
|
||||
Metadata-Version: 2.4
|
||||
Name: pypdf
|
||||
Version: 5.4.0
|
||||
Summary: A pure-python PDF library capable of splitting, merging, cropping, and transforming PDF files
|
||||
Author-email: Mathieu Fenniak <biziqe@mathieu.fenniak.net>
|
||||
Maintainer: stefan6419846
|
||||
Maintainer-email: Martin Thoma <info@martin-thoma.de>
|
||||
Requires-Python: >=3.8
|
||||
Description-Content-Type: text/markdown
|
||||
Classifier: Development Status :: 5 - Production/Stable
|
||||
Classifier: Intended Audience :: Developers
|
||||
Classifier: License :: OSI Approved :: BSD License
|
||||
Classifier: Programming Language :: Python :: 3
|
||||
Classifier: Programming Language :: Python :: 3 :: Only
|
||||
Classifier: Programming Language :: Python :: 3.8
|
||||
Classifier: Programming Language :: Python :: 3.9
|
||||
Classifier: Programming Language :: Python :: 3.10
|
||||
Classifier: Programming Language :: Python :: 3.11
|
||||
Classifier: Programming Language :: Python :: 3.12
|
||||
Classifier: Programming Language :: Python :: 3.13
|
||||
Classifier: Operating System :: OS Independent
|
||||
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
||||
Classifier: Typing :: Typed
|
||||
License-File: LICENSE
|
||||
Requires-Dist: typing_extensions >= 4.0; python_version < '3.11'
|
||||
Requires-Dist: cryptography ; extra == "crypto"
|
||||
Requires-Dist: PyCryptodome ; extra == "cryptodome"
|
||||
Requires-Dist: black ; extra == "dev"
|
||||
Requires-Dist: flit ; extra == "dev"
|
||||
Requires-Dist: pip-tools ; extra == "dev"
|
||||
Requires-Dist: pre-commit<2.18.0 ; extra == "dev"
|
||||
Requires-Dist: pytest-cov ; extra == "dev"
|
||||
Requires-Dist: pytest-socket ; extra == "dev"
|
||||
Requires-Dist: pytest-timeout ; extra == "dev"
|
||||
Requires-Dist: pytest-xdist ; extra == "dev"
|
||||
Requires-Dist: wheel ; extra == "dev"
|
||||
Requires-Dist: myst_parser ; extra == "docs"
|
||||
Requires-Dist: sphinx ; extra == "docs"
|
||||
Requires-Dist: sphinx_rtd_theme ; extra == "docs"
|
||||
Requires-Dist: cryptography ; extra == "full"
|
||||
Requires-Dist: Pillow>=8.0.0 ; extra == "full"
|
||||
Requires-Dist: Pillow>=8.0.0 ; extra == "image"
|
||||
Project-URL: Bug Reports, https://github.com/py-pdf/pypdf/issues
|
||||
Project-URL: Changelog, https://pypdf.readthedocs.io/en/latest/meta/CHANGELOG.html
|
||||
Project-URL: Documentation, https://pypdf.readthedocs.io/en/latest/
|
||||
Project-URL: Source, https://github.com/py-pdf/pypdf
|
||||
Provides-Extra: crypto
|
||||
Provides-Extra: cryptodome
|
||||
Provides-Extra: dev
|
||||
Provides-Extra: docs
|
||||
Provides-Extra: full
|
||||
Provides-Extra: image
|
||||
|
||||
[](https://badge.fury.io/py/pypdf)
|
||||
[](https://pypi.org/project/pypdf/)
|
||||
[](https://pypdf.readthedocs.io/en/stable/)
|
||||
[](https://github.com/py-pdf/pypdf)
|
||||
[](https://codecov.io/gh/py-pdf/pypdf)
|
||||
|
||||
# pypdf
|
||||
|
||||
pypdf is a free and open-source pure-python PDF library capable of splitting,
|
||||
[merging](https://pypdf.readthedocs.io/en/stable/user/merging-pdfs.html),
|
||||
[cropping, and transforming](https://pypdf.readthedocs.io/en/stable/user/cropping-and-transforming.html)
|
||||
the pages of PDF files. It can also add
|
||||
custom data, viewing options, and
|
||||
[passwords](https://pypdf.readthedocs.io/en/stable/user/encryption-decryption.html)
|
||||
to PDF files. pypdf can
|
||||
[retrieve text](https://pypdf.readthedocs.io/en/stable/user/extract-text.html)
|
||||
and
|
||||
[metadata](https://pypdf.readthedocs.io/en/stable/user/metadata.html)
|
||||
from PDFs as well.
|
||||
|
||||
See [pdfly](https://github.com/py-pdf/pdfly) for a CLI application that uses pypdf to interact with PDFs.
|
||||
|
||||
## Installation
|
||||
|
||||
Install pypdf using pip:
|
||||
|
||||
```
|
||||
pip install pypdf
|
||||
```
|
||||
|
||||
For using pypdf with AES encryption or decryption, install extra dependencies:
|
||||
|
||||
```
|
||||
pip install pypdf[crypto]
|
||||
```
|
||||
|
||||
> **NOTE**: `pypdf` 3.1.0 and above include significant improvements compared to
|
||||
> previous versions. Please refer to [the migration
|
||||
> guide](https://pypdf.readthedocs.io/en/latest/user/migration-1-to-2.html) for
|
||||
> more information.
|
||||
|
||||
## Usage
|
||||
|
||||
```python
|
||||
from pypdf import PdfReader
|
||||
|
||||
reader = PdfReader("example.pdf")
|
||||
number_of_pages = len(reader.pages)
|
||||
page = reader.pages[0]
|
||||
text = page.extract_text()
|
||||
```
|
||||
|
||||
pypdf can do a lot more, e.g. splitting, merging, reading and creating
|
||||
annotations, decrypting and encrypting, and more. Check out [the
|
||||
documentation](https://pypdf.readthedocs.io/en/stable/) for additional usage
|
||||
examples!
|
||||
|
||||
For questions and answers, visit
|
||||
[StackOverflow](https://stackoverflow.com/questions/tagged/pypdf)
|
||||
(tagged with [pypdf](https://stackoverflow.com/questions/tagged/pypdf)).
|
||||
|
||||
## Contributions
|
||||
|
||||
Maintaining pypdf is a collaborative effort. You can support the project by
|
||||
writing documentation, helping to narrow down issues, and submitting code.
|
||||
See the [CONTRIBUTING.md](https://github.com/py-pdf/pypdf/blob/main/CONTRIBUTING.md) file for more information.
|
||||
|
||||
### Q&A
|
||||
|
||||
The experience pypdf users have covers the whole range from beginners who
|
||||
want to make their live easier to experts who developed software before PDF
|
||||
existed. You can contribute to the pypdf community by answering questions
|
||||
on [StackOverflow](https://stackoverflow.com/questions/tagged/pypdf),
|
||||
helping in [discussions](https://github.com/py-pdf/pypdf/discussions),
|
||||
and asking users who report issues for [MCVE](https://stackoverflow.com/help/minimal-reproducible-example)'s (Code + example PDF!).
|
||||
|
||||
|
||||
### Issues
|
||||
|
||||
A good bug ticket includes a MCVE - a minimal complete verifiable example.
|
||||
For pypdf, this means that you must upload a PDF that causes the bug to occur
|
||||
as well as the code you're executing with all of the output. Use
|
||||
`print(pypdf.__version__)` to tell us which version you're using.
|
||||
|
||||
### Code
|
||||
|
||||
All code contributions are welcome, but smaller ones have a better chance to
|
||||
get included in a timely manner. Adding unit tests for new features or test
|
||||
cases for bugs you've fixed help us to ensure that the Pull Request (PR) is fine.
|
||||
|
||||
pypdf includes a test suite which can be executed with `pytest`:
|
||||
|
||||
```bash
|
||||
$ pytest
|
||||
===================== test session starts =====================
|
||||
platform linux -- Python 3.6.15, pytest-7.0.1, pluggy-1.0.0
|
||||
rootdir: /home/moose/GitHub/Martin/pypdf
|
||||
plugins: cov-3.0.0
|
||||
collected 233 items
|
||||
|
||||
tests/test_basic_features.py .. [ 0%]
|
||||
tests/test_constants.py . [ 1%]
|
||||
tests/test_filters.py .................x..... [ 11%]
|
||||
tests/test_generic.py ................................. [ 25%]
|
||||
............. [ 30%]
|
||||
tests/test_javascript.py .. [ 31%]
|
||||
tests/test_merger.py . [ 32%]
|
||||
tests/test_page.py ......................... [ 42%]
|
||||
tests/test_pagerange.py ................ [ 49%]
|
||||
tests/test_papersizes.py .................. [ 57%]
|
||||
tests/test_reader.py .................................. [ 72%]
|
||||
............... [ 78%]
|
||||
tests/test_utils.py .................... [ 87%]
|
||||
tests/test_workflows.py .......... [ 91%]
|
||||
tests/test_writer.py ................. [ 98%]
|
||||
tests/test_xmp.py ... [100%]
|
||||
|
||||
========== 232 passed, 1 xfailed, 1 warning in 4.52s ==========
|
||||
```
|
||||
|
||||
113
venv/lib/python3.12/site-packages/pypdf-5.4.0.dist-info/RECORD
Normal file
113
venv/lib/python3.12/site-packages/pypdf-5.4.0.dist-info/RECORD
Normal file
@ -0,0 +1,113 @@
|
||||
pypdf-5.4.0.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
|
||||
pypdf-5.4.0.dist-info/METADATA,sha256=E-D5PSflgLScgSvyNJcLdhpDBX4H0QUafueJFd7PDSA,7262
|
||||
pypdf-5.4.0.dist-info/RECORD,,
|
||||
pypdf-5.4.0.dist-info/REQUESTED,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
||||
pypdf-5.4.0.dist-info/WHEEL,sha256=_2ozNFCLWc93bK4WKHCO-eDUENDlo-dgc9cU3qokYO4,82
|
||||
pypdf-5.4.0.dist-info/licenses/LICENSE,sha256=qXrCMOXzPvEKU2eoUOsB-R8aCwZONHQsd5TSKUVX9SQ,1605
|
||||
pypdf/__init__.py,sha256=WYkiisiLw4TrsrobuzUkEFGwAUbPF8V8ei_HJSdEJNY,1302
|
||||
pypdf/__pycache__/__init__.cpython-312.pyc,,
|
||||
pypdf/__pycache__/_cmap.cpython-312.pyc,,
|
||||
pypdf/__pycache__/_doc_common.cpython-312.pyc,,
|
||||
pypdf/__pycache__/_encryption.cpython-312.pyc,,
|
||||
pypdf/__pycache__/_merger.cpython-312.pyc,,
|
||||
pypdf/__pycache__/_page.cpython-312.pyc,,
|
||||
pypdf/__pycache__/_page_labels.cpython-312.pyc,,
|
||||
pypdf/__pycache__/_protocols.cpython-312.pyc,,
|
||||
pypdf/__pycache__/_reader.cpython-312.pyc,,
|
||||
pypdf/__pycache__/_utils.cpython-312.pyc,,
|
||||
pypdf/__pycache__/_version.cpython-312.pyc,,
|
||||
pypdf/__pycache__/_writer.cpython-312.pyc,,
|
||||
pypdf/__pycache__/_xobj_image_helpers.cpython-312.pyc,,
|
||||
pypdf/__pycache__/constants.cpython-312.pyc,,
|
||||
pypdf/__pycache__/errors.cpython-312.pyc,,
|
||||
pypdf/__pycache__/filters.cpython-312.pyc,,
|
||||
pypdf/__pycache__/pagerange.cpython-312.pyc,,
|
||||
pypdf/__pycache__/papersizes.cpython-312.pyc,,
|
||||
pypdf/__pycache__/types.cpython-312.pyc,,
|
||||
pypdf/__pycache__/xmp.cpython-312.pyc,,
|
||||
pypdf/_cmap.py,sha256=Q4_EJC73QZ-0_I4jtLeHD-rkT5GASW9zehhNcums_0A,18642
|
||||
pypdf/_codecs/__init__.py,sha256=WXMkzlMCDlmG5U6ixQk8MrYxaQeJxEfig5DTaGlklLk,1676
|
||||
pypdf/_codecs/__pycache__/__init__.cpython-312.pyc,,
|
||||
pypdf/_codecs/__pycache__/_codecs.cpython-312.pyc,,
|
||||
pypdf/_codecs/__pycache__/adobe_glyphs.cpython-312.pyc,,
|
||||
pypdf/_codecs/__pycache__/pdfdoc.cpython-312.pyc,,
|
||||
pypdf/_codecs/__pycache__/std.cpython-312.pyc,,
|
||||
pypdf/_codecs/__pycache__/symbol.cpython-312.pyc,,
|
||||
pypdf/_codecs/__pycache__/zapfding.cpython-312.pyc,,
|
||||
pypdf/_codecs/_codecs.py,sha256=zduPFkHbt9BjCpAc7Mx_rSOTEoSOZkUayr8EL5l82VM,9966
|
||||
pypdf/_codecs/adobe_glyphs.py,sha256=t3cDFPDqwIz1w9B0gdVzjdc8eEK9AuRjk5f7laEw_fY,447213
|
||||
pypdf/_codecs/pdfdoc.py,sha256=xfSvMFYsvxuaSQ0Uu9vZDKaB0Wu85h1uCiB1i9rAcUU,4269
|
||||
pypdf/_codecs/std.py,sha256=DyQMuEpAGEpS9uy1jWf4cnj-kqShPOAij5sI7Q1YD8E,2630
|
||||
pypdf/_codecs/symbol.py,sha256=nIaGQIlhWCJiPMHrwUlmGHH-_fOXyEKvguRmuKXcGAk,3734
|
||||
pypdf/_codecs/zapfding.py,sha256=PQxjxRC616d41xF3exVxP1W8nM4QrZfjO3lmtLxpE_s,3742
|
||||
pypdf/_crypt_providers/__init__.py,sha256=K3Z6AuXhXVeXgLet-Tukq2gt9H66OgdupsvxIS1CmkI,3054
|
||||
pypdf/_crypt_providers/__pycache__/__init__.cpython-312.pyc,,
|
||||
pypdf/_crypt_providers/__pycache__/_base.cpython-312.pyc,,
|
||||
pypdf/_crypt_providers/__pycache__/_cryptography.cpython-312.pyc,,
|
||||
pypdf/_crypt_providers/__pycache__/_fallback.cpython-312.pyc,,
|
||||
pypdf/_crypt_providers/__pycache__/_pycryptodome.cpython-312.pyc,,
|
||||
pypdf/_crypt_providers/_base.py,sha256=_f53Mj6vivhEZMQ4vNxN5G0IOgFY-n5_leke0c_qiNU,1711
|
||||
pypdf/_crypt_providers/_cryptography.py,sha256=zT3WmbPzesvgHRkGcKAldqJ24MY3BwZViVbSc55Zxhw,4557
|
||||
pypdf/_crypt_providers/_fallback.py,sha256=vsYoowR1YCAV_q-HrdIZhkUcrCb6HvRBNMYm03QtCU8,3334
|
||||
pypdf/_crypt_providers/_pycryptodome.py,sha256=U1aQZ9iYBrZo-hKCjJUhGOPhwEFToiitowQ316TNrrA,3381
|
||||
pypdf/_doc_common.py,sha256=lyM-6je3IbNfzL6gfYdFU2VvX3pkxj5AWHcEZRCFMQk,51871
|
||||
pypdf/_encryption.py,sha256=pPg7fIfqdL96Tc6RVoBytEVjMrmZFecr_6l7dbtDFrE,48775
|
||||
pypdf/_merger.py,sha256=YfSQKDiiQz2WtCmVZjxP_nv2pR2shiBf2tDiAb41c7s,1744
|
||||
pypdf/_page.py,sha256=6Pts2harKZyD_qhKdbNjWLwy07Gw0QLTWIf_fAMENaA,102235
|
||||
pypdf/_page_labels.py,sha256=nEU0knE7IRQ6LPhzwgw1RjJgm8WxXIfkmiHuv7ep2ow,8546
|
||||
pypdf/_protocols.py,sha256=noE1y2fVE-z1wq-FkQzaS5exa8ovOFTUXqdQSvqi57c,2142
|
||||
pypdf/_reader.py,sha256=tf8l66t8DmoeuZviN2YOdFHAwahnTu92ABAXiK9zCUA,51503
|
||||
pypdf/_text_extraction/__init__.py,sha256=0zxSe5aXqO15dpOg5Q24FawupoTbvJCiHfBzGsWgpJE,8556
|
||||
pypdf/_text_extraction/__pycache__/__init__.cpython-312.pyc,,
|
||||
pypdf/_text_extraction/_layout_mode/__init__.py,sha256=k1tN46gDX1zhAatD8oTGMuCJUp-pgbHjyQ8H6axXRgU,338
|
||||
pypdf/_text_extraction/_layout_mode/__pycache__/__init__.cpython-312.pyc,,
|
||||
pypdf/_text_extraction/_layout_mode/__pycache__/_fixed_width_page.cpython-312.pyc,,
|
||||
pypdf/_text_extraction/_layout_mode/__pycache__/_font.cpython-312.pyc,,
|
||||
pypdf/_text_extraction/_layout_mode/__pycache__/_font_widths.cpython-312.pyc,,
|
||||
pypdf/_text_extraction/_layout_mode/__pycache__/_text_state_manager.cpython-312.pyc,,
|
||||
pypdf/_text_extraction/_layout_mode/__pycache__/_text_state_params.cpython-312.pyc,,
|
||||
pypdf/_text_extraction/_layout_mode/_fixed_width_page.py,sha256=xXC6BwQvrOXMZmSKQ6UPnPtCnjjZ9jCCWTbEJ35E3ko,15424
|
||||
pypdf/_text_extraction/_layout_mode/_font.py,sha256=F0uvly32AcFeTE4jBFg7JvuAQZSMUjO6HZgQYYFDQ40,7048
|
||||
pypdf/_text_extraction/_layout_mode/_font_widths.py,sha256=Hfgsd2ftGw8Ajl7IcwNIlfLYnum-ekaadfwErcUdWtI,4265
|
||||
pypdf/_text_extraction/_layout_mode/_text_state_manager.py,sha256=ugOJRALDNXW3snNAjKKKT8xmWt7D3GZZbcMVaGuVfFM,7989
|
||||
pypdf/_text_extraction/_layout_mode/_text_state_params.py,sha256=b8DSoJ2easCZW_JvMl84WFFIANKGhLD1zjMVAlqScyU,5318
|
||||
pypdf/_utils.py,sha256=h97CvvcQpxq7px__GzaMGzJWqJGZt2FYsZYR6wFiU3w,19300
|
||||
pypdf/_version.py,sha256=xjYaBGUFGg0kGZj_WhuoFyPD8NILPsr79SaMwmYQGSg,22
|
||||
pypdf/_writer.py,sha256=Kjrk1_uMUyZBlsze0qQhS-We90GIk3WtclKLzb373-s,128663
|
||||
pypdf/_xobj_image_helpers.py,sha256=KVC80bgNcHBdqGEOfQbmQO4in6Foayt_lPTgSOgb-BA,14020
|
||||
pypdf/annotations/__init__.py,sha256=f2k_-jAn39CCB27KxQ_e93GinnzkAHbUnnSeGJl1jyE,990
|
||||
pypdf/annotations/__pycache__/__init__.cpython-312.pyc,,
|
||||
pypdf/annotations/__pycache__/_base.cpython-312.pyc,,
|
||||
pypdf/annotations/__pycache__/_markup_annotations.cpython-312.pyc,,
|
||||
pypdf/annotations/__pycache__/_non_markup_annotations.cpython-312.pyc,,
|
||||
pypdf/annotations/_base.py,sha256=7rQJyOMPtKkd_Yp2CXGT6KN17W3WOj8Albx6ehMki3w,916
|
||||
pypdf/annotations/_markup_annotations.py,sha256=F4qMyS15OqXNLL9OTR5Wj2_4vO7ScG60yqNh-wayIFQ,10116
|
||||
pypdf/annotations/_non_markup_annotations.py,sha256=qX51TJMTRUyWz1ogIK-cXXGK7k5oKhgYQhemA_sVxGE,3622
|
||||
pypdf/constants.py,sha256=gwFz97ZB5j0Nn5R7LbWBUqBOcyEjIQRV7O598eLZSKc,20959
|
||||
pypdf/errors.py,sha256=x0J5mTIbp5YcXA1pdYa5DO83uAhXP5NCO0Ankf4DsUY,1740
|
||||
pypdf/filters.py,sha256=hT6e4odOa6WTpXYOxRm2r6fYOS2cocLsVdNPpjEPhn8,27869
|
||||
pypdf/generic/__init__.py,sha256=nnLmD7bnhSJu1qZ774pj0eE7lmeRuYDEUcpa52-Mk5A,7168
|
||||
pypdf/generic/__pycache__/__init__.cpython-312.pyc,,
|
||||
pypdf/generic/__pycache__/_base.cpython-312.pyc,,
|
||||
pypdf/generic/__pycache__/_data_structures.cpython-312.pyc,,
|
||||
pypdf/generic/__pycache__/_files.cpython-312.pyc,,
|
||||
pypdf/generic/__pycache__/_fit.cpython-312.pyc,,
|
||||
pypdf/generic/__pycache__/_image_inline.cpython-312.pyc,,
|
||||
pypdf/generic/__pycache__/_outline.cpython-312.pyc,,
|
||||
pypdf/generic/__pycache__/_rectangle.cpython-312.pyc,,
|
||||
pypdf/generic/__pycache__/_utils.cpython-312.pyc,,
|
||||
pypdf/generic/__pycache__/_viewerpref.cpython-312.pyc,,
|
||||
pypdf/generic/_base.py,sha256=u8oX747OyUZ5KPG8IYWUGD6lgeL-_MzWX0J-LsY0DjA,30885
|
||||
pypdf/generic/_data_structures.py,sha256=kqIGv06r3p0BeUrmTePeFrEoB4v0LiulDvEkTt45TN8,63998
|
||||
pypdf/generic/_files.py,sha256=UcyL_mCDpVh_dRuxxH8bENWA76rYt5eFw0emFcOE79Y,5655
|
||||
pypdf/generic/_fit.py,sha256=lLkLgW0AQ36sVG4py-HXV__EPQYkLA1bNLoCwGJ_ijI,5511
|
||||
pypdf/generic/_image_inline.py,sha256=OyP1GDpg-zgH-UWA--vsLIUriV_07-VqpFZ9mL31vl8,11447
|
||||
pypdf/generic/_outline.py,sha256=qKbMX42OWfqnopIiE6BUy6EvdTLGe3ZtjaiWN85JpaY,1094
|
||||
pypdf/generic/_rectangle.py,sha256=5KJRbQESqdzrYvJOFcwfp0_v_bhCDVj9r4yMyGXSGyc,3808
|
||||
pypdf/generic/_utils.py,sha256=8T_2fGpRt9tZpN-06fa-7Wma9gFAkdtgJW2SuD7Yqfk,7415
|
||||
pypdf/generic/_viewerpref.py,sha256=40YdivA2MAW6hTZEB-b_8Y84-tlNJNwXEusPmHMgS64,6739
|
||||
pypdf/pagerange.py,sha256=9QqjrP6VrR2m8BN_sbbjZQ8Fi476xPpRiKqd8CxGoKM,6996
|
||||
pypdf/papersizes.py,sha256=6Tz5sfNN_3JOUapY83U-lakohnpXYA0hSEQNmOVLFL8,1413
|
||||
pypdf/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
||||
pypdf/types.py,sha256=6B6pMncEhcqFfq-iKs5IBPg6guWXffU6YHpeYzCJH-s,1963
|
||||
pypdf/xmp.py,sha256=0G9Gmb5lc7jdcGG-MYDSxYPg5P7SU_RswVRipuDY7lU,14246
|
||||
@ -0,0 +1,4 @@
|
||||
Wheel-Version: 1.0
|
||||
Generator: flit 3.11.0
|
||||
Root-Is-Purelib: true
|
||||
Tag: py3-none-any
|
||||
@ -0,0 +1,29 @@
|
||||
Copyright (c) 2006-2008, Mathieu Fenniak
|
||||
Some contributions copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com>
|
||||
Some contributions copyright (c) 2014, Steve Witham <switham_github@mac-guyver.com>
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright notice,
|
||||
this list of conditions and the following disclaimer in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
* The name of the author may not be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
49
venv/lib/python3.12/site-packages/pypdf/__init__.py
Normal file
49
venv/lib/python3.12/site-packages/pypdf/__init__.py
Normal file
@ -0,0 +1,49 @@
|
||||
"""
|
||||
pypdf is a free and open-source pure-python PDF library capable of splitting,
|
||||
merging, cropping, and transforming the pages of PDF files. It can also add
|
||||
custom data, viewing options, and passwords to PDF files. pypdf can retrieve
|
||||
text and metadata from PDFs as well.
|
||||
|
||||
You can read the full docs at https://pypdf.readthedocs.io/.
|
||||
"""
|
||||
|
||||
from ._crypt_providers import crypt_provider
|
||||
from ._doc_common import DocumentInformation
|
||||
from ._encryption import PasswordType
|
||||
from ._merger import PdfMerger
|
||||
from ._page import PageObject, Transformation, mult
|
||||
from ._reader import PdfReader
|
||||
from ._version import __version__
|
||||
from ._writer import ObjectDeletionFlag, PdfWriter
|
||||
from .constants import ImageType
|
||||
from .pagerange import PageRange, parse_filename_page_ranges
|
||||
from .papersizes import PaperSize
|
||||
|
||||
try:
|
||||
import PIL
|
||||
|
||||
pil_version = PIL.__version__
|
||||
except ImportError:
|
||||
pil_version = "none"
|
||||
|
||||
_debug_versions = (
|
||||
f"pypdf=={__version__}, {crypt_provider=}, PIL={pil_version}"
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"DocumentInformation",
|
||||
"ImageType",
|
||||
"ObjectDeletionFlag",
|
||||
"PageObject",
|
||||
"PageRange",
|
||||
"PaperSize",
|
||||
"PasswordType",
|
||||
"PdfMerger",
|
||||
"PdfReader",
|
||||
"PdfWriter",
|
||||
"Transformation",
|
||||
"__version__",
|
||||
"_debug_versions",
|
||||
"mult",
|
||||
"parse_filename_page_ranges",
|
||||
]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
544
venv/lib/python3.12/site-packages/pypdf/_cmap.py
Normal file
544
venv/lib/python3.12/site-packages/pypdf/_cmap.py
Normal file
@ -0,0 +1,544 @@
|
||||
import binascii
|
||||
from binascii import unhexlify
|
||||
from math import ceil
|
||||
from typing import Any, Dict, List, Tuple, Union, cast
|
||||
|
||||
from ._codecs import adobe_glyphs, charset_encoding
|
||||
from ._utils import logger_error, logger_warning
|
||||
from .generic import (
|
||||
DecodedStreamObject,
|
||||
DictionaryObject,
|
||||
StreamObject,
|
||||
is_null_or_none,
|
||||
)
|
||||
|
||||
|
||||
# code freely inspired from @twiggy ; see #711
|
||||
def build_char_map(
|
||||
font_name: str, space_width: float, obj: DictionaryObject
|
||||
) -> Tuple[str, float, Union[str, Dict[int, str]], Dict[Any, Any], DictionaryObject]:
|
||||
"""
|
||||
Determine information about a font.
|
||||
|
||||
Args:
|
||||
font_name: font name as a string
|
||||
space_width: default space width if no data is found.
|
||||
obj: XObject or Page where you can find a /Resource dictionary
|
||||
|
||||
Returns:
|
||||
Font sub-type, space_width criteria (50% of width), encoding, map character-map, font-dictionary.
|
||||
The font-dictionary itself is suitable for the curious.
|
||||
|
||||
"""
|
||||
ft: DictionaryObject = obj["/Resources"]["/Font"][font_name] # type: ignore
|
||||
font_subtype, font_halfspace, font_encoding, font_map = build_char_map_from_dict(
|
||||
space_width, ft
|
||||
)
|
||||
return font_subtype, font_halfspace, font_encoding, font_map, ft
|
||||
|
||||
|
||||
def build_char_map_from_dict(
|
||||
space_width: float, ft: DictionaryObject
|
||||
) -> Tuple[str, float, Union[str, Dict[int, str]], Dict[Any, Any]]:
|
||||
"""
|
||||
Determine information about a font.
|
||||
|
||||
Args:
|
||||
space_width: default space with if no data found
|
||||
(normally half the width of a character).
|
||||
ft: Font Dictionary
|
||||
|
||||
Returns:
|
||||
Font sub-type, space_width criteria(50% of width), encoding, map character-map.
|
||||
The font-dictionary itself is suitable for the curious.
|
||||
|
||||
"""
|
||||
font_type = cast(str, ft["/Subtype"].get_object())
|
||||
encoding, map_dict = get_encoding(ft)
|
||||
|
||||
space_key_char = get_actual_str_key(" ", encoding, map_dict)
|
||||
font_width_map = build_font_width_map(ft, space_width * 2.0)
|
||||
half_space_width = compute_space_width(font_width_map, space_key_char) / 2.0
|
||||
|
||||
return (
|
||||
font_type,
|
||||
half_space_width,
|
||||
encoding,
|
||||
# https://github.com/python/mypy/issues/4374
|
||||
map_dict
|
||||
)
|
||||
|
||||
|
||||
# used when missing data, e.g. font def missing
|
||||
unknown_char_map: Tuple[str, float, Union[str, Dict[int, str]], Dict[Any, Any]] = (
|
||||
"Unknown",
|
||||
9999,
|
||||
dict.fromkeys(range(256), "<EFBFBD>"),
|
||||
{},
|
||||
)
|
||||
|
||||
|
||||
_predefined_cmap: Dict[str, str] = {
|
||||
"/Identity-H": "utf-16-be",
|
||||
"/Identity-V": "utf-16-be",
|
||||
"/GB-EUC-H": "gbk",
|
||||
"/GB-EUC-V": "gbk",
|
||||
"/GBpc-EUC-H": "gb2312",
|
||||
"/GBpc-EUC-V": "gb2312",
|
||||
"/GBK-EUC-H": "gbk",
|
||||
"/GBK-EUC-V": "gbk",
|
||||
"/GBK2K-H": "gb18030",
|
||||
"/GBK2K-V": "gb18030",
|
||||
"/ETen-B5-H": "cp950",
|
||||
"/ETen-B5-V": "cp950",
|
||||
"/ETenms-B5-H": "cp950",
|
||||
"/ETenms-B5-V": "cp950",
|
||||
"/UniCNS-UTF16-H": "utf-16-be",
|
||||
"/UniCNS-UTF16-V": "utf-16-be",
|
||||
"/UniGB-UTF16-H": "gb18030",
|
||||
"/UniGB-UTF16-V": "gb18030",
|
||||
# UCS2 in code
|
||||
}
|
||||
|
||||
# manually extracted from http://mirrors.ctan.org/fonts/adobe/afm/Adobe-Core35_AFMs-229.tar.gz
|
||||
_default_fonts_space_width: Dict[str, int] = {
|
||||
"/Courier": 600,
|
||||
"/Courier-Bold": 600,
|
||||
"/Courier-BoldOblique": 600,
|
||||
"/Courier-Oblique": 600,
|
||||
"/Helvetica": 278,
|
||||
"/Helvetica-Bold": 278,
|
||||
"/Helvetica-BoldOblique": 278,
|
||||
"/Helvetica-Oblique": 278,
|
||||
"/Helvetica-Narrow": 228,
|
||||
"/Helvetica-NarrowBold": 228,
|
||||
"/Helvetica-NarrowBoldOblique": 228,
|
||||
"/Helvetica-NarrowOblique": 228,
|
||||
"/Times-Roman": 250,
|
||||
"/Times-Bold": 250,
|
||||
"/Times-BoldItalic": 250,
|
||||
"/Times-Italic": 250,
|
||||
"/Symbol": 250,
|
||||
"/ZapfDingbats": 278,
|
||||
}
|
||||
|
||||
|
||||
def get_encoding(
|
||||
ft: DictionaryObject
|
||||
) -> Tuple[Union[str, Dict[int, str]], Dict[Any, Any]]:
|
||||
encoding = _parse_encoding(ft)
|
||||
map_dict, int_entry = _parse_to_unicode(ft)
|
||||
|
||||
# Apply rule from PDF ref 1.7 §5.9.1, 1st bullet:
|
||||
# if cmap not empty encoding should be discarded
|
||||
# (here transformed into identity for those characters)
|
||||
# If encoding is a string it is expected to be an identity translation.
|
||||
if isinstance(encoding, dict):
|
||||
for x in int_entry:
|
||||
if x <= 255:
|
||||
encoding[x] = chr(x)
|
||||
|
||||
return encoding, map_dict
|
||||
|
||||
|
||||
def _parse_encoding(
|
||||
ft: DictionaryObject
|
||||
) -> Union[str, Dict[int, str]]:
|
||||
encoding: Union[str, List[str], Dict[int, str]] = []
|
||||
if "/Encoding" not in ft:
|
||||
if "/BaseFont" in ft and cast(str, ft["/BaseFont"]) in charset_encoding:
|
||||
encoding = dict(
|
||||
zip(range(256), charset_encoding[cast(str, ft["/BaseFont"])])
|
||||
)
|
||||
else:
|
||||
encoding = "charmap"
|
||||
return encoding
|
||||
enc: Union(str, DictionaryObject) = ft["/Encoding"].get_object() # type: ignore
|
||||
if isinstance(enc, str):
|
||||
try:
|
||||
# already done : enc = NameObject.unnumber(enc.encode()).decode()
|
||||
# for #xx decoding
|
||||
if enc in charset_encoding:
|
||||
encoding = charset_encoding[enc].copy()
|
||||
elif enc in _predefined_cmap:
|
||||
encoding = _predefined_cmap[enc]
|
||||
elif "-UCS2-" in enc:
|
||||
encoding = "utf-16-be"
|
||||
else:
|
||||
raise Exception("not found")
|
||||
except Exception:
|
||||
logger_error(f"Advanced encoding {enc} not implemented yet", __name__)
|
||||
encoding = enc
|
||||
elif isinstance(enc, DictionaryObject) and "/BaseEncoding" in enc:
|
||||
try:
|
||||
encoding = charset_encoding[cast(str, enc["/BaseEncoding"])].copy()
|
||||
except Exception:
|
||||
logger_error(
|
||||
f"Advanced encoding {encoding} not implemented yet",
|
||||
__name__,
|
||||
)
|
||||
encoding = charset_encoding["/StandardEncoding"].copy()
|
||||
else:
|
||||
encoding = charset_encoding["/StandardEncoding"].copy()
|
||||
if "/Differences" in enc:
|
||||
x: int = 0
|
||||
o: Union[int, str]
|
||||
for o in cast(DictionaryObject, cast(DictionaryObject, enc)["/Differences"]):
|
||||
if isinstance(o, int):
|
||||
x = o
|
||||
else: # isinstance(o,str):
|
||||
try:
|
||||
if x < len(encoding):
|
||||
encoding[x] = adobe_glyphs[o] # type: ignore
|
||||
except Exception:
|
||||
encoding[x] = o # type: ignore
|
||||
x += 1
|
||||
if isinstance(encoding, list):
|
||||
encoding = dict(zip(range(256), encoding))
|
||||
return encoding
|
||||
|
||||
|
||||
def _parse_to_unicode(
|
||||
ft: DictionaryObject
|
||||
) -> Tuple[Dict[Any, Any], List[int]]:
|
||||
# will store all translation code
|
||||
# and map_dict[-1] we will have the number of bytes to convert
|
||||
map_dict: Dict[Any, Any] = {}
|
||||
|
||||
# will provide the list of cmap keys as int to correct encoding
|
||||
int_entry: List[int] = []
|
||||
|
||||
if "/ToUnicode" not in ft:
|
||||
if ft.get("/Subtype", "") == "/Type1":
|
||||
return _type1_alternative(ft, map_dict, int_entry)
|
||||
else:
|
||||
return {}, []
|
||||
process_rg: bool = False
|
||||
process_char: bool = False
|
||||
multiline_rg: Union[
|
||||
None, Tuple[int, int]
|
||||
] = None # tuple = (current_char, remaining size) ; cf #1285 for example of file
|
||||
cm = prepare_cm(ft)
|
||||
for line in cm.split(b"\n"):
|
||||
process_rg, process_char, multiline_rg = process_cm_line(
|
||||
line.strip(b" \t"),
|
||||
process_rg,
|
||||
process_char,
|
||||
multiline_rg,
|
||||
map_dict,
|
||||
int_entry,
|
||||
)
|
||||
|
||||
return map_dict, int_entry
|
||||
|
||||
|
||||
def get_actual_str_key(
|
||||
value_char: str, encoding: Union[str, Dict[int, str]], map_dict: Dict[Any, Any]
|
||||
) -> str:
|
||||
key_dict = {}
|
||||
if isinstance(encoding, dict):
|
||||
key_dict = {value: chr(key) for key, value in encoding.items() if value == value_char}
|
||||
else:
|
||||
key_dict = {value: key for key, value in map_dict.items() if value == value_char}
|
||||
key_char = key_dict.get(value_char, value_char)
|
||||
return key_char
|
||||
|
||||
|
||||
def prepare_cm(ft: DictionaryObject) -> bytes:
|
||||
tu = ft["/ToUnicode"]
|
||||
cm: bytes
|
||||
if isinstance(tu, StreamObject):
|
||||
cm = cast(DecodedStreamObject, ft["/ToUnicode"]).get_data()
|
||||
else: # if (tu is None) or cast(str, tu).startswith("/Identity"):
|
||||
# the full range 0000-FFFF will be processed
|
||||
cm = b"beginbfrange\n<0000> <0001> <0000>\nendbfrange"
|
||||
if isinstance(cm, str):
|
||||
cm = cm.encode()
|
||||
# we need to prepare cm before due to missing return line in pdf printed
|
||||
# to pdf from word
|
||||
cm = (
|
||||
cm.strip()
|
||||
.replace(b"beginbfchar", b"\nbeginbfchar\n")
|
||||
.replace(b"endbfchar", b"\nendbfchar\n")
|
||||
.replace(b"beginbfrange", b"\nbeginbfrange\n")
|
||||
.replace(b"endbfrange", b"\nendbfrange\n")
|
||||
.replace(b"<<", b"\n{\n") # text between << and >> not used but
|
||||
.replace(b">>", b"\n}\n") # some solution to find it back
|
||||
)
|
||||
ll = cm.split(b"<")
|
||||
for i in range(len(ll)):
|
||||
j = ll[i].find(b">")
|
||||
if j >= 0:
|
||||
if j == 0:
|
||||
# string is empty: stash a placeholder here (see below)
|
||||
# see https://github.com/py-pdf/pypdf/issues/1111
|
||||
content = b"."
|
||||
else:
|
||||
content = ll[i][:j].replace(b" ", b"")
|
||||
ll[i] = content + b" " + ll[i][j + 1 :]
|
||||
cm = (
|
||||
(b" ".join(ll))
|
||||
.replace(b"[", b" [ ")
|
||||
.replace(b"]", b" ]\n ")
|
||||
.replace(b"\r", b"\n")
|
||||
)
|
||||
return cm
|
||||
|
||||
|
||||
def process_cm_line(
|
||||
line: bytes,
|
||||
process_rg: bool,
|
||||
process_char: bool,
|
||||
multiline_rg: Union[None, Tuple[int, int]],
|
||||
map_dict: Dict[Any, Any],
|
||||
int_entry: List[int],
|
||||
) -> Tuple[bool, bool, Union[None, Tuple[int, int]]]:
|
||||
if line == b"" or line[0] == 37: # 37 = %
|
||||
return process_rg, process_char, multiline_rg
|
||||
line = line.replace(b"\t", b" ")
|
||||
if b"beginbfrange" in line:
|
||||
process_rg = True
|
||||
elif b"endbfrange" in line:
|
||||
process_rg = False
|
||||
elif b"beginbfchar" in line:
|
||||
process_char = True
|
||||
elif b"endbfchar" in line:
|
||||
process_char = False
|
||||
elif process_rg:
|
||||
try:
|
||||
multiline_rg = parse_bfrange(line, map_dict, int_entry, multiline_rg)
|
||||
except binascii.Error as error:
|
||||
logger_warning(f"Skipping broken line {line!r}: {error}", __name__)
|
||||
elif process_char:
|
||||
parse_bfchar(line, map_dict, int_entry)
|
||||
return process_rg, process_char, multiline_rg
|
||||
|
||||
|
||||
def parse_bfrange(
|
||||
line: bytes,
|
||||
map_dict: Dict[Any, Any],
|
||||
int_entry: List[int],
|
||||
multiline_rg: Union[None, Tuple[int, int]],
|
||||
) -> Union[None, Tuple[int, int]]:
|
||||
lst = [x for x in line.split(b" ") if x]
|
||||
closure_found = False
|
||||
if multiline_rg is not None:
|
||||
fmt = b"%%0%dX" % (map_dict[-1] * 2)
|
||||
a = multiline_rg[0] # a, b not in the current line
|
||||
b = multiline_rg[1]
|
||||
for sq in lst[0:]:
|
||||
if sq == b"]":
|
||||
closure_found = True
|
||||
break
|
||||
map_dict[
|
||||
unhexlify(fmt % a).decode(
|
||||
"charmap" if map_dict[-1] == 1 else "utf-16-be",
|
||||
"surrogatepass",
|
||||
)
|
||||
] = unhexlify(sq).decode("utf-16-be", "surrogatepass")
|
||||
int_entry.append(a)
|
||||
a += 1
|
||||
else:
|
||||
a = int(lst[0], 16)
|
||||
b = int(lst[1], 16)
|
||||
nbi = max(len(lst[0]), len(lst[1]))
|
||||
map_dict[-1] = ceil(nbi / 2)
|
||||
fmt = b"%%0%dX" % (map_dict[-1] * 2)
|
||||
if lst[2] == b"[":
|
||||
for sq in lst[3:]:
|
||||
if sq == b"]":
|
||||
closure_found = True
|
||||
break
|
||||
map_dict[
|
||||
unhexlify(fmt % a).decode(
|
||||
"charmap" if map_dict[-1] == 1 else "utf-16-be",
|
||||
"surrogatepass",
|
||||
)
|
||||
] = unhexlify(sq).decode("utf-16-be", "surrogatepass")
|
||||
int_entry.append(a)
|
||||
a += 1
|
||||
else: # case without list
|
||||
c = int(lst[2], 16)
|
||||
fmt2 = b"%%0%dX" % max(4, len(lst[2]))
|
||||
closure_found = True
|
||||
while a <= b:
|
||||
map_dict[
|
||||
unhexlify(fmt % a).decode(
|
||||
"charmap" if map_dict[-1] == 1 else "utf-16-be",
|
||||
"surrogatepass",
|
||||
)
|
||||
] = unhexlify(fmt2 % c).decode("utf-16-be", "surrogatepass")
|
||||
int_entry.append(a)
|
||||
a += 1
|
||||
c += 1
|
||||
return None if closure_found else (a, b)
|
||||
|
||||
|
||||
def parse_bfchar(line: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) -> None:
|
||||
lst = [x for x in line.split(b" ") if x]
|
||||
map_dict[-1] = len(lst[0]) // 2
|
||||
while len(lst) > 1:
|
||||
map_to = ""
|
||||
# placeholder (see above) means empty string
|
||||
if lst[1] != b".":
|
||||
map_to = unhexlify(lst[1]).decode(
|
||||
"charmap" if len(lst[1]) < 4 else "utf-16-be", "surrogatepass"
|
||||
) # join is here as some cases where the code was split
|
||||
map_dict[
|
||||
unhexlify(lst[0]).decode(
|
||||
"charmap" if map_dict[-1] == 1 else "utf-16-be", "surrogatepass"
|
||||
)
|
||||
] = map_to
|
||||
int_entry.append(int(lst[0], 16))
|
||||
lst = lst[2:]
|
||||
|
||||
|
||||
def build_font_width_map(
|
||||
ft: DictionaryObject, default_font_width: float
|
||||
) -> Dict[Any, float]:
|
||||
font_width_map: Dict[Any, float] = {}
|
||||
st: int = 0
|
||||
en: int = 0
|
||||
try:
|
||||
default_font_width = _default_fonts_space_width[cast(str, ft["/BaseFont"].get_object())] * 2.0
|
||||
except KeyError:
|
||||
pass
|
||||
if "/DescendantFonts" in ft: # ft["/Subtype"].startswith("/CIDFontType"):
|
||||
# §9.7.4.3 of the 1.7 reference ("Glyph Metrics in CIDFonts")
|
||||
# Widths for a CIDFont are defined using the DW and W entries.
|
||||
# DW2 and W2 are for vertical use. Vertical type is not implemented.
|
||||
ft1 = ft["/DescendantFonts"][0].get_object() # type: ignore
|
||||
if "/DW" in ft1:
|
||||
font_width_map["default"] = cast(float, ft1["/DW"].get_object())
|
||||
else:
|
||||
font_width_map["default"] = default_font_width
|
||||
if "/W" in ft1:
|
||||
w = ft1["/W"].get_object()
|
||||
else:
|
||||
w = []
|
||||
while len(w) > 0:
|
||||
st = w[0] if isinstance(w[0], int) else w[0].get_object()
|
||||
second = w[1].get_object()
|
||||
if isinstance(second, int):
|
||||
# C_first C_last same_W
|
||||
en = second
|
||||
width = w[2].get_object()
|
||||
if not isinstance(width, (int, float)):
|
||||
logger_warning(f"Expected numeric value for width, got {width}. Ignoring it.", __name__)
|
||||
w = w[3:]
|
||||
continue
|
||||
for c_code in range(st, en + 1):
|
||||
font_width_map[chr(c_code)] = width
|
||||
w = w[3:]
|
||||
elif isinstance(second, list):
|
||||
# Starting_C [W1 W2 ... Wn]
|
||||
c_code = st
|
||||
for ww in second:
|
||||
width = ww.get_object()
|
||||
font_width_map[chr(c_code)] = width
|
||||
c_code += 1
|
||||
w = w[2:]
|
||||
else:
|
||||
logger_warning(
|
||||
"unknown widths : \n" + (ft1["/W"]).__repr__(),
|
||||
__name__,
|
||||
)
|
||||
break
|
||||
elif "/Widths" in ft:
|
||||
w = ft["/Widths"].get_object()
|
||||
if "/FontDescriptor" in ft and "/MissingWidth" in cast(
|
||||
DictionaryObject, ft["/FontDescriptor"]
|
||||
):
|
||||
font_width_map["default"] = ft["/FontDescriptor"]["/MissingWidth"].get_object() # type: ignore
|
||||
else:
|
||||
# will consider width of char as avg(width)
|
||||
m = 0
|
||||
cpt = 0
|
||||
for xx in w:
|
||||
xx = xx.get_object()
|
||||
if xx > 0:
|
||||
m += xx
|
||||
cpt += 1
|
||||
font_width_map["default"] = m / max(1, cpt)
|
||||
st = cast(int, ft["/FirstChar"])
|
||||
en = cast(int, ft["/LastChar"])
|
||||
for c_code in range(st, en + 1):
|
||||
try:
|
||||
width = w[c_code - st].get_object()
|
||||
font_width_map[chr(c_code)] = width
|
||||
except (IndexError, KeyError):
|
||||
# The PDF structure is invalid. The array is too small
|
||||
# for the specified font width.
|
||||
pass
|
||||
if is_null_or_none(font_width_map.get("default")):
|
||||
font_width_map["default"] = default_font_width if default_font_width else 0.0
|
||||
return font_width_map
|
||||
|
||||
|
||||
def compute_space_width(
|
||||
font_width_map: Dict[Any, float], space_char: str
|
||||
) -> float:
|
||||
try:
|
||||
sp_width = font_width_map[space_char]
|
||||
if sp_width == 0:
|
||||
raise ValueError("Zero width")
|
||||
except (KeyError, ValueError):
|
||||
sp_width = (
|
||||
font_width_map["default"] / 2.0
|
||||
) # if using default we consider space will be only half size
|
||||
|
||||
return sp_width
|
||||
|
||||
|
||||
def compute_font_width(
|
||||
font_width_map: Dict[Any, float],
|
||||
char: str
|
||||
) -> float:
|
||||
char_width: float = 0.0
|
||||
try:
|
||||
char_width = font_width_map[char]
|
||||
except KeyError:
|
||||
char_width = (
|
||||
font_width_map["default"]
|
||||
)
|
||||
|
||||
return char_width
|
||||
|
||||
|
||||
def _type1_alternative(
|
||||
ft: DictionaryObject,
|
||||
map_dict: Dict[Any, Any],
|
||||
int_entry: List[int],
|
||||
) -> Tuple[Dict[Any, Any], List[int]]:
|
||||
if "/FontDescriptor" not in ft:
|
||||
return map_dict, int_entry
|
||||
ft_desc = cast(DictionaryObject, ft["/FontDescriptor"]).get("/FontFile")
|
||||
if is_null_or_none(ft_desc):
|
||||
return map_dict, int_entry
|
||||
assert ft_desc is not None, "mypy"
|
||||
txt = ft_desc.get_object().get_data()
|
||||
txt = txt.split(b"eexec\n")[0] # only clear part
|
||||
txt = txt.split(b"/Encoding")[1] # to get the encoding part
|
||||
lines = txt.replace(b"\r", b"\n").split(b"\n")
|
||||
for li in lines:
|
||||
if li.startswith(b"dup"):
|
||||
words = [_w for _w in li.split(b" ") if _w != b""]
|
||||
if len(words) > 3 and words[3] != b"put":
|
||||
continue
|
||||
try:
|
||||
i = int(words[1])
|
||||
except ValueError: # pragma: no cover
|
||||
continue
|
||||
try:
|
||||
v = adobe_glyphs[words[2].decode()]
|
||||
except KeyError:
|
||||
if words[2].startswith(b"/uni"):
|
||||
try:
|
||||
v = chr(int(words[2][4:], 16))
|
||||
except ValueError: # pragma: no cover
|
||||
continue
|
||||
else:
|
||||
continue
|
||||
map_dict[chr(i)] = v
|
||||
int_entry.append(i)
|
||||
return map_dict, int_entry
|
||||
61
venv/lib/python3.12/site-packages/pypdf/_codecs/__init__.py
Normal file
61
venv/lib/python3.12/site-packages/pypdf/_codecs/__init__.py
Normal file
@ -0,0 +1,61 @@
|
||||
from typing import Dict, List
|
||||
|
||||
from .adobe_glyphs import adobe_glyphs
|
||||
from .pdfdoc import _pdfdoc_encoding
|
||||
from .std import _std_encoding
|
||||
from .symbol import _symbol_encoding
|
||||
from .zapfding import _zapfding_encoding
|
||||
|
||||
|
||||
def fill_from_encoding(enc: str) -> List[str]:
|
||||
lst: List[str] = []
|
||||
for x in range(256):
|
||||
try:
|
||||
lst += (bytes((x,)).decode(enc),)
|
||||
except Exception:
|
||||
lst += (chr(x),)
|
||||
return lst
|
||||
|
||||
|
||||
def rev_encoding(enc: List[str]) -> Dict[str, int]:
|
||||
rev: Dict[str, int] = {}
|
||||
for i in range(256):
|
||||
char = enc[i]
|
||||
if char == "\u0000":
|
||||
continue
|
||||
assert char not in rev, f"{char} at {i} already at {rev[char]}"
|
||||
rev[char] = i
|
||||
return rev
|
||||
|
||||
|
||||
_win_encoding = fill_from_encoding("cp1252")
|
||||
_mac_encoding = fill_from_encoding("mac_roman")
|
||||
|
||||
|
||||
_win_encoding_rev: Dict[str, int] = rev_encoding(_win_encoding)
|
||||
_mac_encoding_rev: Dict[str, int] = rev_encoding(_mac_encoding)
|
||||
_symbol_encoding_rev: Dict[str, int] = rev_encoding(_symbol_encoding)
|
||||
_zapfding_encoding_rev: Dict[str, int] = rev_encoding(_zapfding_encoding)
|
||||
_pdfdoc_encoding_rev: Dict[str, int] = rev_encoding(_pdfdoc_encoding)
|
||||
|
||||
|
||||
charset_encoding: Dict[str, List[str]] = {
|
||||
"/StandardEncoding": _std_encoding,
|
||||
"/WinAnsiEncoding": _win_encoding,
|
||||
"/MacRomanEncoding": _mac_encoding,
|
||||
"/PDFDocEncoding": _pdfdoc_encoding,
|
||||
"/Symbol": _symbol_encoding,
|
||||
"/ZapfDingbats": _zapfding_encoding,
|
||||
}
|
||||
|
||||
__all__ = [
|
||||
"_mac_encoding",
|
||||
"_pdfdoc_encoding",
|
||||
"_pdfdoc_encoding_rev",
|
||||
"_std_encoding",
|
||||
"_symbol_encoding",
|
||||
"_win_encoding",
|
||||
"_zapfding_encoding",
|
||||
"adobe_glyphs",
|
||||
"charset_encoding",
|
||||
]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
268
venv/lib/python3.12/site-packages/pypdf/_codecs/_codecs.py
Normal file
268
venv/lib/python3.12/site-packages/pypdf/_codecs/_codecs.py
Normal file
@ -0,0 +1,268 @@
|
||||
"""
|
||||
This module is for codecs only.
|
||||
|
||||
While the codec implementation can contain details of the PDF specification,
|
||||
the module should not do any PDF parsing.
|
||||
"""
|
||||
|
||||
import io
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Dict, List
|
||||
|
||||
from pypdf._utils import logger_warning
|
||||
|
||||
|
||||
class Codec(ABC):
|
||||
"""Abstract base class for all codecs."""
|
||||
|
||||
@abstractmethod
|
||||
def encode(self, data: bytes) -> bytes:
|
||||
"""
|
||||
Encode the input data.
|
||||
|
||||
Args:
|
||||
data: Data to encode.
|
||||
|
||||
Returns:
|
||||
Encoded data.
|
||||
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
def decode(self, data: bytes) -> bytes:
|
||||
"""
|
||||
Decode the input data.
|
||||
|
||||
Args:
|
||||
data: Data to decode.
|
||||
|
||||
Returns:
|
||||
Decoded data.
|
||||
|
||||
"""
|
||||
|
||||
|
||||
class LzwCodec(Codec):
|
||||
"""Lempel-Ziv-Welch (LZW) adaptive compression codec."""
|
||||
|
||||
CLEAR_TABLE_MARKER = 256 # Special code to indicate table reset
|
||||
EOD_MARKER = 257 # End-of-data marker
|
||||
INITIAL_BITS_PER_CODE = 9 # Initial code bit width
|
||||
MAX_BITS_PER_CODE = 12 # Maximum code bit width
|
||||
|
||||
def _initialize_encoding_table(self) -> None:
|
||||
"""Initialize the encoding table and state to initial conditions."""
|
||||
self.encoding_table: Dict[bytes, int] = {bytes([i]): i for i in range(256)}
|
||||
self.next_code = self.EOD_MARKER + 1
|
||||
self.bits_per_code = self.INITIAL_BITS_PER_CODE
|
||||
self.max_code_value = (1 << self.bits_per_code) - 1
|
||||
|
||||
def _increase_next_code(self) -> None:
|
||||
"""Update bits_per_code and max_code_value if necessary."""
|
||||
self.next_code += 1
|
||||
if (
|
||||
self.next_code > self.max_code_value
|
||||
and self.bits_per_code < self.MAX_BITS_PER_CODE
|
||||
):
|
||||
self.bits_per_code += 1
|
||||
self.max_code_value = (1 << self.bits_per_code) - 1
|
||||
|
||||
def encode(self, data: bytes) -> bytes:
|
||||
"""
|
||||
Encode data using the LZW compression algorithm.
|
||||
|
||||
Taken from PDF 1.7 specs, "7.4.4.2 Details of LZW Encoding".
|
||||
"""
|
||||
result_codes: List[int] = []
|
||||
|
||||
# The encoder shall begin by issuing a clear-table code
|
||||
result_codes.append(self.CLEAR_TABLE_MARKER)
|
||||
self._initialize_encoding_table()
|
||||
|
||||
current_sequence = b""
|
||||
for byte in data:
|
||||
next_sequence = current_sequence + bytes([byte])
|
||||
|
||||
if next_sequence in self.encoding_table:
|
||||
# Extend current sequence if already in the table
|
||||
current_sequence = next_sequence
|
||||
else:
|
||||
# Output code for the current sequence
|
||||
result_codes.append(self.encoding_table[current_sequence])
|
||||
|
||||
# Add the new sequence to the table if there's room
|
||||
if self.next_code <= (1 << self.MAX_BITS_PER_CODE) - 1:
|
||||
self.encoding_table[next_sequence] = self.next_code
|
||||
self._increase_next_code()
|
||||
else:
|
||||
# If the table is full, emit a clear-table command
|
||||
result_codes.append(self.CLEAR_TABLE_MARKER)
|
||||
self._initialize_encoding_table()
|
||||
|
||||
# Start new sequence
|
||||
current_sequence = bytes([byte])
|
||||
|
||||
# Ensure everything actually is encoded
|
||||
if current_sequence:
|
||||
result_codes.append(self.encoding_table[current_sequence])
|
||||
result_codes.append(self.EOD_MARKER)
|
||||
|
||||
return self._pack_codes_into_bytes(result_codes)
|
||||
|
||||
def _pack_codes_into_bytes(self, codes: List[int]) -> bytes:
|
||||
"""
|
||||
Convert the list of result codes into a continuous byte stream, with codes packed as per the code bit-width.
|
||||
The bit-width starts at 9 bits and expands as needed.
|
||||
"""
|
||||
self._initialize_encoding_table()
|
||||
buffer = 0
|
||||
bits_in_buffer = 0
|
||||
output = bytearray()
|
||||
|
||||
for code in codes:
|
||||
buffer = (buffer << self.bits_per_code) | code
|
||||
bits_in_buffer += self.bits_per_code
|
||||
|
||||
# Codes shall be packed into a continuous bit stream, high-order bit
|
||||
# first. This stream shall then be divided into bytes, high-order bit
|
||||
# first.
|
||||
while bits_in_buffer >= 8:
|
||||
bits_in_buffer -= 8
|
||||
output.append((buffer >> bits_in_buffer) & 0xFF)
|
||||
|
||||
if code == self.CLEAR_TABLE_MARKER:
|
||||
self._initialize_encoding_table()
|
||||
elif code == self.EOD_MARKER:
|
||||
continue
|
||||
else:
|
||||
self._increase_next_code()
|
||||
|
||||
# Flush any remaining bits in the buffer
|
||||
if bits_in_buffer > 0:
|
||||
output.append((buffer << (8 - bits_in_buffer)) & 0xFF)
|
||||
|
||||
return bytes(output)
|
||||
|
||||
def _initialize_decoding_table(self) -> None:
|
||||
self.max_code_value = (1 << self.MAX_BITS_PER_CODE) - 1
|
||||
self.decoding_table = [bytes([i]) for i in range(self.CLEAR_TABLE_MARKER)] + [
|
||||
b""
|
||||
] * (self.max_code_value - self.CLEAR_TABLE_MARKER + 1)
|
||||
self._table_index = self.EOD_MARKER + 1
|
||||
self._bits_to_get = 9
|
||||
|
||||
def _next_code_decode(self, data: bytes) -> int:
|
||||
self._next_data: int
|
||||
try:
|
||||
while self._next_bits < self._bits_to_get:
|
||||
self._next_data = (self._next_data << 8) | (
|
||||
data[self._byte_pointer] & 0xFF
|
||||
)
|
||||
self._byte_pointer += 1
|
||||
self._next_bits += 8
|
||||
|
||||
code = (
|
||||
self._next_data >> (self._next_bits - self._bits_to_get)
|
||||
) & self._and_table[self._bits_to_get - 9]
|
||||
self._next_bits -= self._bits_to_get
|
||||
|
||||
return code
|
||||
except IndexError:
|
||||
return self.EOD_MARKER
|
||||
|
||||
# The following method has been converted to Python from PDFsharp:
|
||||
# https://github.com/empira/PDFsharp/blob/5fbf6ed14740bc4e16786816882d32e43af3ff5d/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Filters/LzwDecode.cs
|
||||
#
|
||||
# Original license:
|
||||
#
|
||||
# -------------------------------------------------------------------------
|
||||
# Copyright (c) 2001-2024 empira Software GmbH, Troisdorf (Cologne Area),
|
||||
# Germany
|
||||
#
|
||||
# http://docs.pdfsharp.net
|
||||
#
|
||||
# MIT License
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a
|
||||
# copy of this software and associated documentation files (the "Software"),
|
||||
# to deal in the Software without restriction, including without limitation
|
||||
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
# and/or sell copies of the Software, and to permit persons to whom the
|
||||
# Software is furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included
|
||||
# in all copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
# DEALINGS IN THE SOFTWARE.
|
||||
# --------------------------------------------------------------------------
|
||||
def decode(self, data: bytes) -> bytes:
|
||||
"""
|
||||
The following code was converted to Python from the following code:
|
||||
https://github.com/empira/PDFsharp/blob/master/src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Filters/LzwDecode.cs
|
||||
"""
|
||||
self._and_table = [511, 1023, 2047, 4095]
|
||||
self._table_index = 0
|
||||
self._bits_to_get = 9
|
||||
self._byte_pointer = 0
|
||||
self._next_data = 0
|
||||
self._next_bits = 0
|
||||
|
||||
output_stream = io.BytesIO()
|
||||
|
||||
self._initialize_decoding_table()
|
||||
self._byte_pointer = 0
|
||||
self._next_data = 0
|
||||
self._next_bits = 0
|
||||
old_code = self.CLEAR_TABLE_MARKER
|
||||
|
||||
while True:
|
||||
code = self._next_code_decode(data)
|
||||
if code == self.EOD_MARKER:
|
||||
break
|
||||
|
||||
if code == self.CLEAR_TABLE_MARKER:
|
||||
self._initialize_decoding_table()
|
||||
code = self._next_code_decode(data)
|
||||
if code == self.EOD_MARKER:
|
||||
break
|
||||
output_stream.write(self.decoding_table[code])
|
||||
old_code = code
|
||||
elif code < self._table_index:
|
||||
string = self.decoding_table[code]
|
||||
output_stream.write(string)
|
||||
if old_code != self.CLEAR_TABLE_MARKER:
|
||||
self._add_entry_decode(self.decoding_table[old_code], string[0])
|
||||
old_code = code
|
||||
else:
|
||||
# The code is not in the table and not one of the special codes
|
||||
string = (
|
||||
self.decoding_table[old_code] + self.decoding_table[old_code][:1]
|
||||
)
|
||||
output_stream.write(string)
|
||||
self._add_entry_decode(self.decoding_table[old_code], string[0])
|
||||
old_code = code
|
||||
|
||||
output = output_stream.getvalue()
|
||||
return output
|
||||
|
||||
def _add_entry_decode(self, old_string: bytes, new_char: int) -> None:
|
||||
new_string = old_string + bytes([new_char])
|
||||
if self._table_index > self.max_code_value:
|
||||
logger_warning("Ignoring too large LZW table index.", __name__)
|
||||
return
|
||||
self.decoding_table[self._table_index] = new_string
|
||||
self._table_index += 1
|
||||
|
||||
# Update the number of bits to get based on the table index
|
||||
if self._table_index == 511:
|
||||
self._bits_to_get = 10
|
||||
elif self._table_index == 1023:
|
||||
self._bits_to_get = 11
|
||||
elif self._table_index == 2047:
|
||||
self._bits_to_get = 12
|
||||
13969
venv/lib/python3.12/site-packages/pypdf/_codecs/adobe_glyphs.py
Normal file
13969
venv/lib/python3.12/site-packages/pypdf/_codecs/adobe_glyphs.py
Normal file
File diff suppressed because it is too large
Load Diff
264
venv/lib/python3.12/site-packages/pypdf/_codecs/pdfdoc.py
Normal file
264
venv/lib/python3.12/site-packages/pypdf/_codecs/pdfdoc.py
Normal file
@ -0,0 +1,264 @@
|
||||
# PDFDocEncoding Character Set: Table D.2 of PDF Reference 1.7
|
||||
# C.1 Predefined encodings sorted by character name of another PDF reference
|
||||
# Some indices have '\u0000' although they should have something else:
|
||||
# 22: should be '\u0017'
|
||||
_pdfdoc_encoding = [
|
||||
"\u0000",
|
||||
"\u0001",
|
||||
"\u0002",
|
||||
"\u0003",
|
||||
"\u0004",
|
||||
"\u0005",
|
||||
"\u0006",
|
||||
"\u0007", # 0 - 7
|
||||
"\u0008",
|
||||
"\u0009",
|
||||
"\u000a",
|
||||
"\u000b",
|
||||
"\u000c",
|
||||
"\u000d",
|
||||
"\u000e",
|
||||
"\u000f", # 8 - 15
|
||||
"\u0010",
|
||||
"\u0011",
|
||||
"\u0012",
|
||||
"\u0013",
|
||||
"\u0014",
|
||||
"\u0015",
|
||||
"\u0000",
|
||||
"\u0017", # 16 - 23
|
||||
"\u02d8",
|
||||
"\u02c7",
|
||||
"\u02c6",
|
||||
"\u02d9",
|
||||
"\u02dd",
|
||||
"\u02db",
|
||||
"\u02da",
|
||||
"\u02dc", # 24 - 31
|
||||
"\u0020",
|
||||
"\u0021",
|
||||
"\u0022",
|
||||
"\u0023",
|
||||
"\u0024",
|
||||
"\u0025",
|
||||
"\u0026",
|
||||
"\u0027", # 32 - 39
|
||||
"\u0028",
|
||||
"\u0029",
|
||||
"\u002a",
|
||||
"\u002b",
|
||||
"\u002c",
|
||||
"\u002d",
|
||||
"\u002e",
|
||||
"\u002f", # 40 - 47
|
||||
"\u0030",
|
||||
"\u0031",
|
||||
"\u0032",
|
||||
"\u0033",
|
||||
"\u0034",
|
||||
"\u0035",
|
||||
"\u0036",
|
||||
"\u0037", # 48 - 55
|
||||
"\u0038",
|
||||
"\u0039",
|
||||
"\u003a",
|
||||
"\u003b",
|
||||
"\u003c",
|
||||
"\u003d",
|
||||
"\u003e",
|
||||
"\u003f", # 56 - 63
|
||||
"\u0040",
|
||||
"\u0041",
|
||||
"\u0042",
|
||||
"\u0043",
|
||||
"\u0044",
|
||||
"\u0045",
|
||||
"\u0046",
|
||||
"\u0047", # 64 - 71
|
||||
"\u0048",
|
||||
"\u0049",
|
||||
"\u004a",
|
||||
"\u004b",
|
||||
"\u004c",
|
||||
"\u004d",
|
||||
"\u004e",
|
||||
"\u004f", # 72 - 79
|
||||
"\u0050",
|
||||
"\u0051",
|
||||
"\u0052",
|
||||
"\u0053",
|
||||
"\u0054",
|
||||
"\u0055",
|
||||
"\u0056",
|
||||
"\u0057", # 80 - 87
|
||||
"\u0058",
|
||||
"\u0059",
|
||||
"\u005a",
|
||||
"\u005b",
|
||||
"\u005c",
|
||||
"\u005d",
|
||||
"\u005e",
|
||||
"\u005f", # 88 - 95
|
||||
"\u0060",
|
||||
"\u0061",
|
||||
"\u0062",
|
||||
"\u0063",
|
||||
"\u0064",
|
||||
"\u0065",
|
||||
"\u0066",
|
||||
"\u0067", # 96 - 103
|
||||
"\u0068",
|
||||
"\u0069",
|
||||
"\u006a",
|
||||
"\u006b",
|
||||
"\u006c",
|
||||
"\u006d",
|
||||
"\u006e",
|
||||
"\u006f", # 104 - 111
|
||||
"\u0070",
|
||||
"\u0071",
|
||||
"\u0072",
|
||||
"\u0073",
|
||||
"\u0074",
|
||||
"\u0075",
|
||||
"\u0076",
|
||||
"\u0077", # 112 - 119
|
||||
"\u0078",
|
||||
"\u0079",
|
||||
"\u007a",
|
||||
"\u007b",
|
||||
"\u007c",
|
||||
"\u007d",
|
||||
"\u007e",
|
||||
"\u0000", # 120 - 127
|
||||
"\u2022",
|
||||
"\u2020",
|
||||
"\u2021",
|
||||
"\u2026",
|
||||
"\u2014",
|
||||
"\u2013",
|
||||
"\u0192",
|
||||
"\u2044", # 128 - 135
|
||||
"\u2039",
|
||||
"\u203a",
|
||||
"\u2212",
|
||||
"\u2030",
|
||||
"\u201e",
|
||||
"\u201c",
|
||||
"\u201d",
|
||||
"\u2018", # 136 - 143
|
||||
"\u2019",
|
||||
"\u201a",
|
||||
"\u2122",
|
||||
"\ufb01",
|
||||
"\ufb02",
|
||||
"\u0141",
|
||||
"\u0152",
|
||||
"\u0160", # 144 - 151
|
||||
"\u0178",
|
||||
"\u017d",
|
||||
"\u0131",
|
||||
"\u0142",
|
||||
"\u0153",
|
||||
"\u0161",
|
||||
"\u017e",
|
||||
"\u0000", # 152 - 159
|
||||
"\u20ac",
|
||||
"\u00a1",
|
||||
"\u00a2",
|
||||
"\u00a3",
|
||||
"\u00a4",
|
||||
"\u00a5",
|
||||
"\u00a6",
|
||||
"\u00a7", # 160 - 167
|
||||
"\u00a8",
|
||||
"\u00a9",
|
||||
"\u00aa",
|
||||
"\u00ab",
|
||||
"\u00ac",
|
||||
"\u0000",
|
||||
"\u00ae",
|
||||
"\u00af", # 168 - 175
|
||||
"\u00b0",
|
||||
"\u00b1",
|
||||
"\u00b2",
|
||||
"\u00b3",
|
||||
"\u00b4",
|
||||
"\u00b5",
|
||||
"\u00b6",
|
||||
"\u00b7", # 176 - 183
|
||||
"\u00b8",
|
||||
"\u00b9",
|
||||
"\u00ba",
|
||||
"\u00bb",
|
||||
"\u00bc",
|
||||
"\u00bd",
|
||||
"\u00be",
|
||||
"\u00bf", # 184 - 191
|
||||
"\u00c0",
|
||||
"\u00c1",
|
||||
"\u00c2",
|
||||
"\u00c3",
|
||||
"\u00c4",
|
||||
"\u00c5",
|
||||
"\u00c6",
|
||||
"\u00c7", # 192 - 199
|
||||
"\u00c8",
|
||||
"\u00c9",
|
||||
"\u00ca",
|
||||
"\u00cb",
|
||||
"\u00cc",
|
||||
"\u00cd",
|
||||
"\u00ce",
|
||||
"\u00cf", # 200 - 207
|
||||
"\u00d0",
|
||||
"\u00d1",
|
||||
"\u00d2",
|
||||
"\u00d3",
|
||||
"\u00d4",
|
||||
"\u00d5",
|
||||
"\u00d6",
|
||||
"\u00d7", # 208 - 215
|
||||
"\u00d8",
|
||||
"\u00d9",
|
||||
"\u00da",
|
||||
"\u00db",
|
||||
"\u00dc",
|
||||
"\u00dd",
|
||||
"\u00de",
|
||||
"\u00df", # 216 - 223
|
||||
"\u00e0",
|
||||
"\u00e1",
|
||||
"\u00e2",
|
||||
"\u00e3",
|
||||
"\u00e4",
|
||||
"\u00e5",
|
||||
"\u00e6",
|
||||
"\u00e7", # 224 - 231
|
||||
"\u00e8",
|
||||
"\u00e9",
|
||||
"\u00ea",
|
||||
"\u00eb",
|
||||
"\u00ec",
|
||||
"\u00ed",
|
||||
"\u00ee",
|
||||
"\u00ef", # 232 - 239
|
||||
"\u00f0",
|
||||
"\u00f1",
|
||||
"\u00f2",
|
||||
"\u00f3",
|
||||
"\u00f4",
|
||||
"\u00f5",
|
||||
"\u00f6",
|
||||
"\u00f7", # 240 - 247
|
||||
"\u00f8",
|
||||
"\u00f9",
|
||||
"\u00fa",
|
||||
"\u00fb",
|
||||
"\u00fc",
|
||||
"\u00fd",
|
||||
"\u00fe",
|
||||
"\u00ff", # 248 - 255
|
||||
]
|
||||
|
||||
assert len(_pdfdoc_encoding) == 256
|
||||
258
venv/lib/python3.12/site-packages/pypdf/_codecs/std.py
Normal file
258
venv/lib/python3.12/site-packages/pypdf/_codecs/std.py
Normal file
@ -0,0 +1,258 @@
|
||||
_std_encoding = [
|
||||
"\x00",
|
||||
"\x01",
|
||||
"\x02",
|
||||
"\x03",
|
||||
"\x04",
|
||||
"\x05",
|
||||
"\x06",
|
||||
"\x07",
|
||||
"\x08",
|
||||
"\t",
|
||||
"\n",
|
||||
"\x0b",
|
||||
"\x0c",
|
||||
"\r",
|
||||
"\x0e",
|
||||
"\x0f",
|
||||
"\x10",
|
||||
"\x11",
|
||||
"\x12",
|
||||
"\x13",
|
||||
"\x14",
|
||||
"\x15",
|
||||
"\x16",
|
||||
"\x17",
|
||||
"\x18",
|
||||
"\x19",
|
||||
"\x1a",
|
||||
"\x1b",
|
||||
"\x1c",
|
||||
"\x1d",
|
||||
"\x1e",
|
||||
"\x1f",
|
||||
" ",
|
||||
"!",
|
||||
'"',
|
||||
"#",
|
||||
"$",
|
||||
"%",
|
||||
"&",
|
||||
"’",
|
||||
"(",
|
||||
")",
|
||||
"*",
|
||||
"+",
|
||||
",",
|
||||
"-",
|
||||
".",
|
||||
"/",
|
||||
"0",
|
||||
"1",
|
||||
"2",
|
||||
"3",
|
||||
"4",
|
||||
"5",
|
||||
"6",
|
||||
"7",
|
||||
"8",
|
||||
"9",
|
||||
":",
|
||||
";",
|
||||
"<",
|
||||
"=",
|
||||
">",
|
||||
"?",
|
||||
"@",
|
||||
"A",
|
||||
"B",
|
||||
"C",
|
||||
"D",
|
||||
"E",
|
||||
"F",
|
||||
"G",
|
||||
"H",
|
||||
"I",
|
||||
"J",
|
||||
"K",
|
||||
"L",
|
||||
"M",
|
||||
"N",
|
||||
"O",
|
||||
"P",
|
||||
"Q",
|
||||
"R",
|
||||
"S",
|
||||
"T",
|
||||
"U",
|
||||
"V",
|
||||
"W",
|
||||
"X",
|
||||
"Y",
|
||||
"Z",
|
||||
"[",
|
||||
"\\",
|
||||
"]",
|
||||
"^",
|
||||
"_",
|
||||
"‘",
|
||||
"a",
|
||||
"b",
|
||||
"c",
|
||||
"d",
|
||||
"e",
|
||||
"f",
|
||||
"g",
|
||||
"h",
|
||||
"i",
|
||||
"j",
|
||||
"k",
|
||||
"l",
|
||||
"m",
|
||||
"n",
|
||||
"o",
|
||||
"p",
|
||||
"q",
|
||||
"r",
|
||||
"s",
|
||||
"t",
|
||||
"u",
|
||||
"v",
|
||||
"w",
|
||||
"x",
|
||||
"y",
|
||||
"z",
|
||||
"{",
|
||||
"|",
|
||||
"}",
|
||||
"~",
|
||||
"\x7f",
|
||||
"\x80",
|
||||
"\x81",
|
||||
"\x82",
|
||||
"\x83",
|
||||
"\x84",
|
||||
"\x85",
|
||||
"\x86",
|
||||
"\x87",
|
||||
"\x88",
|
||||
"\x89",
|
||||
"\x8a",
|
||||
"\x8b",
|
||||
"\x8c",
|
||||
"\x8d",
|
||||
"\x8e",
|
||||
"\x8f",
|
||||
"\x90",
|
||||
"\x91",
|
||||
"\x92",
|
||||
"\x93",
|
||||
"\x94",
|
||||
"\x95",
|
||||
"\x96",
|
||||
"\x97",
|
||||
"\x98",
|
||||
"\x99",
|
||||
"\x9a",
|
||||
"\x9b",
|
||||
"\x9c",
|
||||
"\x9d",
|
||||
"\x9e",
|
||||
"\x9f",
|
||||
"\xa0",
|
||||
"¡",
|
||||
"¢",
|
||||
"£",
|
||||
"⁄",
|
||||
"¥",
|
||||
"ƒ",
|
||||
"§",
|
||||
"¤",
|
||||
"'",
|
||||
"“",
|
||||
"«",
|
||||
"‹",
|
||||
"›",
|
||||
"fi",
|
||||
"fl",
|
||||
"°",
|
||||
"–",
|
||||
"†",
|
||||
"‡",
|
||||
"·",
|
||||
"µ",
|
||||
"¶",
|
||||
"•",
|
||||
"‚",
|
||||
"„",
|
||||
"”",
|
||||
"»",
|
||||
"…",
|
||||
"‰",
|
||||
"¾",
|
||||
"¿",
|
||||
"À",
|
||||
"`",
|
||||
"´",
|
||||
"ˆ",
|
||||
"˜",
|
||||
"¯",
|
||||
"˘",
|
||||
"˙",
|
||||
"¨",
|
||||
"É",
|
||||
"˚",
|
||||
"¸",
|
||||
"Ì",
|
||||
"˝",
|
||||
"˛",
|
||||
"ˇ",
|
||||
"—",
|
||||
"Ñ",
|
||||
"Ò",
|
||||
"Ó",
|
||||
"Ô",
|
||||
"Õ",
|
||||
"Ö",
|
||||
"×",
|
||||
"Ø",
|
||||
"Ù",
|
||||
"Ú",
|
||||
"Û",
|
||||
"Ü",
|
||||
"Ý",
|
||||
"Þ",
|
||||
"ß",
|
||||
"à",
|
||||
"Æ",
|
||||
"â",
|
||||
"ª",
|
||||
"ä",
|
||||
"å",
|
||||
"æ",
|
||||
"ç",
|
||||
"Ł",
|
||||
"Ø",
|
||||
"Œ",
|
||||
"º",
|
||||
"ì",
|
||||
"í",
|
||||
"î",
|
||||
"ï",
|
||||
"ð",
|
||||
"æ",
|
||||
"ò",
|
||||
"ó",
|
||||
"ô",
|
||||
"ı",
|
||||
"ö",
|
||||
"÷",
|
||||
"ł",
|
||||
"ø",
|
||||
"œ",
|
||||
"ß",
|
||||
"ü",
|
||||
"ý",
|
||||
"þ",
|
||||
"ÿ",
|
||||
]
|
||||
260
venv/lib/python3.12/site-packages/pypdf/_codecs/symbol.py
Normal file
260
venv/lib/python3.12/site-packages/pypdf/_codecs/symbol.py
Normal file
@ -0,0 +1,260 @@
|
||||
# manually generated from https://www.unicode.org/Public/MAPPINGS/VENDORS/ADOBE/symbol.txt
|
||||
_symbol_encoding = [
|
||||
"\u0000",
|
||||
"\u0001",
|
||||
"\u0002",
|
||||
"\u0003",
|
||||
"\u0004",
|
||||
"\u0005",
|
||||
"\u0006",
|
||||
"\u0007",
|
||||
"\u0008",
|
||||
"\u0009",
|
||||
"\u000A",
|
||||
"\u000B",
|
||||
"\u000C",
|
||||
"\u000D",
|
||||
"\u000E",
|
||||
"\u000F",
|
||||
"\u0010",
|
||||
"\u0011",
|
||||
"\u0012",
|
||||
"\u0013",
|
||||
"\u0014",
|
||||
"\u0015",
|
||||
"\u0016",
|
||||
"\u0017",
|
||||
"\u0018",
|
||||
"\u0019",
|
||||
"\u001A",
|
||||
"\u001B",
|
||||
"\u001C",
|
||||
"\u001D",
|
||||
"\u001E",
|
||||
"\u001F",
|
||||
"\u0020",
|
||||
"\u0021",
|
||||
"\u2200",
|
||||
"\u0023",
|
||||
"\u2203",
|
||||
"\u0025",
|
||||
"\u0026",
|
||||
"\u220B",
|
||||
"\u0028",
|
||||
"\u0029",
|
||||
"\u2217",
|
||||
"\u002B",
|
||||
"\u002C",
|
||||
"\u2212",
|
||||
"\u002E",
|
||||
"\u002F",
|
||||
"\u0030",
|
||||
"\u0031",
|
||||
"\u0032",
|
||||
"\u0033",
|
||||
"\u0034",
|
||||
"\u0035",
|
||||
"\u0036",
|
||||
"\u0037",
|
||||
"\u0038",
|
||||
"\u0039",
|
||||
"\u003A",
|
||||
"\u003B",
|
||||
"\u003C",
|
||||
"\u003D",
|
||||
"\u003E",
|
||||
"\u003F",
|
||||
"\u2245",
|
||||
"\u0391",
|
||||
"\u0392",
|
||||
"\u03A7",
|
||||
"\u0394",
|
||||
"\u0395",
|
||||
"\u03A6",
|
||||
"\u0393",
|
||||
"\u0397",
|
||||
"\u0399",
|
||||
"\u03D1",
|
||||
"\u039A",
|
||||
"\u039B",
|
||||
"\u039C",
|
||||
"\u039D",
|
||||
"\u039F",
|
||||
"\u03A0",
|
||||
"\u0398",
|
||||
"\u03A1",
|
||||
"\u03A3",
|
||||
"\u03A4",
|
||||
"\u03A5",
|
||||
"\u03C2",
|
||||
"\u03A9",
|
||||
"\u039E",
|
||||
"\u03A8",
|
||||
"\u0396",
|
||||
"\u005B",
|
||||
"\u2234",
|
||||
"\u005D",
|
||||
"\u22A5",
|
||||
"\u005F",
|
||||
"\uF8E5",
|
||||
"\u03B1",
|
||||
"\u03B2",
|
||||
"\u03C7",
|
||||
"\u03B4",
|
||||
"\u03B5",
|
||||
"\u03C6",
|
||||
"\u03B3",
|
||||
"\u03B7",
|
||||
"\u03B9",
|
||||
"\u03D5",
|
||||
"\u03BA",
|
||||
"\u03BB",
|
||||
"\u00B5",
|
||||
"\u03BD",
|
||||
"\u03BF",
|
||||
"\u03C0",
|
||||
"\u03B8",
|
||||
"\u03C1",
|
||||
"\u03C3",
|
||||
"\u03C4",
|
||||
"\u03C5",
|
||||
"\u03D6",
|
||||
"\u03C9",
|
||||
"\u03BE",
|
||||
"\u03C8",
|
||||
"\u03B6",
|
||||
"\u007B",
|
||||
"\u007C",
|
||||
"\u007D",
|
||||
"\u223C",
|
||||
"\u007F",
|
||||
"\u0080",
|
||||
"\u0081",
|
||||
"\u0082",
|
||||
"\u0083",
|
||||
"\u0084",
|
||||
"\u0085",
|
||||
"\u0086",
|
||||
"\u0087",
|
||||
"\u0088",
|
||||
"\u0089",
|
||||
"\u008A",
|
||||
"\u008B",
|
||||
"\u008C",
|
||||
"\u008D",
|
||||
"\u008E",
|
||||
"\u008F",
|
||||
"\u0090",
|
||||
"\u0091",
|
||||
"\u0092",
|
||||
"\u0093",
|
||||
"\u0094",
|
||||
"\u0095",
|
||||
"\u0096",
|
||||
"\u0097",
|
||||
"\u0098",
|
||||
"\u0099",
|
||||
"\u009A",
|
||||
"\u009B",
|
||||
"\u009C",
|
||||
"\u009D",
|
||||
"\u009E",
|
||||
"\u009F",
|
||||
"\u20AC",
|
||||
"\u03D2",
|
||||
"\u2032",
|
||||
"\u2264",
|
||||
"\u2044",
|
||||
"\u221E",
|
||||
"\u0192",
|
||||
"\u2663",
|
||||
"\u2666",
|
||||
"\u2665",
|
||||
"\u2660",
|
||||
"\u2194",
|
||||
"\u2190",
|
||||
"\u2191",
|
||||
"\u2192",
|
||||
"\u2193",
|
||||
"\u00B0",
|
||||
"\u00B1",
|
||||
"\u2033",
|
||||
"\u2265",
|
||||
"\u00D7",
|
||||
"\u221D",
|
||||
"\u2202",
|
||||
"\u2022",
|
||||
"\u00F7",
|
||||
"\u2260",
|
||||
"\u2261",
|
||||
"\u2248",
|
||||
"\u2026",
|
||||
"\uF8E6",
|
||||
"\uF8E7",
|
||||
"\u21B5",
|
||||
"\u2135",
|
||||
"\u2111",
|
||||
"\u211C",
|
||||
"\u2118",
|
||||
"\u2297",
|
||||
"\u2295",
|
||||
"\u2205",
|
||||
"\u2229",
|
||||
"\u222A",
|
||||
"\u2283",
|
||||
"\u2287",
|
||||
"\u2284",
|
||||
"\u2282",
|
||||
"\u2286",
|
||||
"\u2208",
|
||||
"\u2209",
|
||||
"\u2220",
|
||||
"\u2207",
|
||||
"\uF6DA",
|
||||
"\uF6D9",
|
||||
"\uF6DB",
|
||||
"\u220F",
|
||||
"\u221A",
|
||||
"\u22C5",
|
||||
"\u00AC",
|
||||
"\u2227",
|
||||
"\u2228",
|
||||
"\u21D4",
|
||||
"\u21D0",
|
||||
"\u21D1",
|
||||
"\u21D2",
|
||||
"\u21D3",
|
||||
"\u25CA",
|
||||
"\u2329",
|
||||
"\uF8E8",
|
||||
"\uF8E9",
|
||||
"\uF8EA",
|
||||
"\u2211",
|
||||
"\uF8EB",
|
||||
"\uF8EC",
|
||||
"\uF8ED",
|
||||
"\uF8EE",
|
||||
"\uF8EF",
|
||||
"\uF8F0",
|
||||
"\uF8F1",
|
||||
"\uF8F2",
|
||||
"\uF8F3",
|
||||
"\uF8F4",
|
||||
"\u00F0",
|
||||
"\u232A",
|
||||
"\u222B",
|
||||
"\u2320",
|
||||
"\uF8F5",
|
||||
"\u2321",
|
||||
"\uF8F6",
|
||||
"\uF8F7",
|
||||
"\uF8F8",
|
||||
"\uF8F9",
|
||||
"\uF8FA",
|
||||
"\uF8FB",
|
||||
"\uF8FC",
|
||||
"\uF8FD",
|
||||
"\uF8FE",
|
||||
"\u00FF",
|
||||
]
|
||||
assert len(_symbol_encoding) == 256
|
||||
261
venv/lib/python3.12/site-packages/pypdf/_codecs/zapfding.py
Normal file
261
venv/lib/python3.12/site-packages/pypdf/_codecs/zapfding.py
Normal file
@ -0,0 +1,261 @@
|
||||
# manually generated from https://www.unicode.org/Public/MAPPINGS/VENDORS/ADOBE/zdingbat.txt
|
||||
|
||||
_zapfding_encoding = [
|
||||
"\u0000",
|
||||
"\u0001",
|
||||
"\u0002",
|
||||
"\u0003",
|
||||
"\u0004",
|
||||
"\u0005",
|
||||
"\u0006",
|
||||
"\u0007",
|
||||
"\u0008",
|
||||
"\u0009",
|
||||
"\u000A",
|
||||
"\u000B",
|
||||
"\u000C",
|
||||
"\u000D",
|
||||
"\u000E",
|
||||
"\u000F",
|
||||
"\u0010",
|
||||
"\u0011",
|
||||
"\u0012",
|
||||
"\u0013",
|
||||
"\u0014",
|
||||
"\u0015",
|
||||
"\u0016",
|
||||
"\u0017",
|
||||
"\u0018",
|
||||
"\u0019",
|
||||
"\u001A",
|
||||
"\u001B",
|
||||
"\u001C",
|
||||
"\u001D",
|
||||
"\u001E",
|
||||
"\u001F",
|
||||
"\u0020",
|
||||
"\u2701",
|
||||
"\u2702",
|
||||
"\u2703",
|
||||
"\u2704",
|
||||
"\u260E",
|
||||
"\u2706",
|
||||
"\u2707",
|
||||
"\u2708",
|
||||
"\u2709",
|
||||
"\u261B",
|
||||
"\u261E",
|
||||
"\u270C",
|
||||
"\u270D",
|
||||
"\u270E",
|
||||
"\u270F",
|
||||
"\u2710",
|
||||
"\u2711",
|
||||
"\u2712",
|
||||
"\u2713",
|
||||
"\u2714",
|
||||
"\u2715",
|
||||
"\u2716",
|
||||
"\u2717",
|
||||
"\u2718",
|
||||
"\u2719",
|
||||
"\u271A",
|
||||
"\u271B",
|
||||
"\u271C",
|
||||
"\u271D",
|
||||
"\u271E",
|
||||
"\u271F",
|
||||
"\u2720",
|
||||
"\u2721",
|
||||
"\u2722",
|
||||
"\u2723",
|
||||
"\u2724",
|
||||
"\u2725",
|
||||
"\u2726",
|
||||
"\u2727",
|
||||
"\u2605",
|
||||
"\u2729",
|
||||
"\u272A",
|
||||
"\u272B",
|
||||
"\u272C",
|
||||
"\u272D",
|
||||
"\u272E",
|
||||
"\u272F",
|
||||
"\u2730",
|
||||
"\u2731",
|
||||
"\u2732",
|
||||
"\u2733",
|
||||
"\u2734",
|
||||
"\u2735",
|
||||
"\u2736",
|
||||
"\u2737",
|
||||
"\u2738",
|
||||
"\u2739",
|
||||
"\u273A",
|
||||
"\u273B",
|
||||
"\u273C",
|
||||
"\u273D",
|
||||
"\u273E",
|
||||
"\u273F",
|
||||
"\u2740",
|
||||
"\u2741",
|
||||
"\u2742",
|
||||
"\u2743",
|
||||
"\u2744",
|
||||
"\u2745",
|
||||
"\u2746",
|
||||
"\u2747",
|
||||
"\u2748",
|
||||
"\u2749",
|
||||
"\u274A",
|
||||
"\u274B",
|
||||
"\u25CF",
|
||||
"\u274D",
|
||||
"\u25A0",
|
||||
"\u274F",
|
||||
"\u2750",
|
||||
"\u2751",
|
||||
"\u2752",
|
||||
"\u25B2",
|
||||
"\u25BC",
|
||||
"\u25C6",
|
||||
"\u2756",
|
||||
"\u25D7",
|
||||
"\u2758",
|
||||
"\u2759",
|
||||
"\u275A",
|
||||
"\u275B",
|
||||
"\u275C",
|
||||
"\u275D",
|
||||
"\u275E",
|
||||
"\u007F",
|
||||
"\uF8D7",
|
||||
"\uF8D8",
|
||||
"\uF8D9",
|
||||
"\uF8DA",
|
||||
"\uF8DB",
|
||||
"\uF8DC",
|
||||
"\uF8DD",
|
||||
"\uF8DE",
|
||||
"\uF8DF",
|
||||
"\uF8E0",
|
||||
"\uF8E1",
|
||||
"\uF8E2",
|
||||
"\uF8E3",
|
||||
"\uF8E4",
|
||||
"\u008E",
|
||||
"\u008F",
|
||||
"\u0090",
|
||||
"\u0091",
|
||||
"\u0092",
|
||||
"\u0093",
|
||||
"\u0094",
|
||||
"\u0095",
|
||||
"\u0096",
|
||||
"\u0097",
|
||||
"\u0098",
|
||||
"\u0099",
|
||||
"\u009A",
|
||||
"\u009B",
|
||||
"\u009C",
|
||||
"\u009D",
|
||||
"\u009E",
|
||||
"\u009F",
|
||||
"\u00A0",
|
||||
"\u2761",
|
||||
"\u2762",
|
||||
"\u2763",
|
||||
"\u2764",
|
||||
"\u2765",
|
||||
"\u2766",
|
||||
"\u2767",
|
||||
"\u2663",
|
||||
"\u2666",
|
||||
"\u2665",
|
||||
"\u2660",
|
||||
"\u2460",
|
||||
"\u2461",
|
||||
"\u2462",
|
||||
"\u2463",
|
||||
"\u2464",
|
||||
"\u2465",
|
||||
"\u2466",
|
||||
"\u2467",
|
||||
"\u2468",
|
||||
"\u2469",
|
||||
"\u2776",
|
||||
"\u2777",
|
||||
"\u2778",
|
||||
"\u2779",
|
||||
"\u277A",
|
||||
"\u277B",
|
||||
"\u277C",
|
||||
"\u277D",
|
||||
"\u277E",
|
||||
"\u277F",
|
||||
"\u2780",
|
||||
"\u2781",
|
||||
"\u2782",
|
||||
"\u2783",
|
||||
"\u2784",
|
||||
"\u2785",
|
||||
"\u2786",
|
||||
"\u2787",
|
||||
"\u2788",
|
||||
"\u2789",
|
||||
"\u278A",
|
||||
"\u278B",
|
||||
"\u278C",
|
||||
"\u278D",
|
||||
"\u278E",
|
||||
"\u278F",
|
||||
"\u2790",
|
||||
"\u2791",
|
||||
"\u2792",
|
||||
"\u2793",
|
||||
"\u2794",
|
||||
"\u2192",
|
||||
"\u2194",
|
||||
"\u2195",
|
||||
"\u2798",
|
||||
"\u2799",
|
||||
"\u279A",
|
||||
"\u279B",
|
||||
"\u279C",
|
||||
"\u279D",
|
||||
"\u279E",
|
||||
"\u279F",
|
||||
"\u27A0",
|
||||
"\u27A1",
|
||||
"\u27A2",
|
||||
"\u27A3",
|
||||
"\u27A4",
|
||||
"\u27A5",
|
||||
"\u27A6",
|
||||
"\u27A7",
|
||||
"\u27A8",
|
||||
"\u27A9",
|
||||
"\u27AA",
|
||||
"\u27AB",
|
||||
"\u27AC",
|
||||
"\u27AD",
|
||||
"\u27AE",
|
||||
"\u27AF",
|
||||
"\u00F0",
|
||||
"\u27B1",
|
||||
"\u27B2",
|
||||
"\u27B3",
|
||||
"\u27B4",
|
||||
"\u27B5",
|
||||
"\u27B6",
|
||||
"\u27B7",
|
||||
"\u27B8",
|
||||
"\u27B9",
|
||||
"\u27BA",
|
||||
"\u27BB",
|
||||
"\u27BC",
|
||||
"\u27BD",
|
||||
"\u27BE",
|
||||
"\u00FF",
|
||||
]
|
||||
assert len(_zapfding_encoding) == 256
|
||||
@ -0,0 +1,86 @@
|
||||
# Copyright (c) 2023, exiledkingcc
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
# met:
|
||||
#
|
||||
# * Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
# * The name of the author may not be used to endorse or promote products
|
||||
# derived from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
from pypdf._crypt_providers._base import CryptBase, CryptIdentity
|
||||
|
||||
try:
|
||||
from pypdf._crypt_providers._cryptography import (
|
||||
CryptAES,
|
||||
CryptRC4,
|
||||
aes_cbc_decrypt,
|
||||
aes_cbc_encrypt,
|
||||
aes_ecb_decrypt,
|
||||
aes_ecb_encrypt,
|
||||
crypt_provider,
|
||||
rc4_decrypt,
|
||||
rc4_encrypt,
|
||||
)
|
||||
from pypdf._utils import Version
|
||||
|
||||
if Version(crypt_provider[1]) <= Version("3.0"):
|
||||
# This is due to the backend parameter being required back then:
|
||||
# https://cryptography.io/en/latest/changelog/#v3-1
|
||||
raise ImportError("cryptography<=3.0 is not supported") # pragma: no cover
|
||||
except ImportError:
|
||||
try:
|
||||
from pypdf._crypt_providers._pycryptodome import ( # type: ignore
|
||||
CryptAES,
|
||||
CryptRC4,
|
||||
aes_cbc_decrypt,
|
||||
aes_cbc_encrypt,
|
||||
aes_ecb_decrypt,
|
||||
aes_ecb_encrypt,
|
||||
crypt_provider,
|
||||
rc4_decrypt,
|
||||
rc4_encrypt,
|
||||
)
|
||||
except ImportError:
|
||||
from pypdf._crypt_providers._fallback import ( # type: ignore
|
||||
CryptAES,
|
||||
CryptRC4,
|
||||
aes_cbc_decrypt,
|
||||
aes_cbc_encrypt,
|
||||
aes_ecb_decrypt,
|
||||
aes_ecb_encrypt,
|
||||
crypt_provider,
|
||||
rc4_decrypt,
|
||||
rc4_encrypt,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"CryptAES",
|
||||
"CryptBase",
|
||||
"CryptIdentity",
|
||||
"CryptRC4",
|
||||
"aes_cbc_decrypt",
|
||||
"aes_cbc_encrypt",
|
||||
"aes_ecb_decrypt",
|
||||
"aes_ecb_encrypt",
|
||||
"crypt_provider",
|
||||
"rc4_decrypt",
|
||||
"rc4_encrypt",
|
||||
]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,38 @@
|
||||
# Copyright (c) 2023, exiledkingcc
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
# met:
|
||||
#
|
||||
# * Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
# * The name of the author may not be used to endorse or promote products
|
||||
# derived from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
class CryptBase:
|
||||
def encrypt(self, data: bytes) -> bytes: # pragma: no cover
|
||||
return data
|
||||
|
||||
def decrypt(self, data: bytes) -> bytes: # pragma: no cover
|
||||
return data
|
||||
|
||||
|
||||
class CryptIdentity(CryptBase):
|
||||
pass
|
||||
@ -0,0 +1,118 @@
|
||||
# Copyright (c) 2023, exiledkingcc
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
# met:
|
||||
#
|
||||
# * Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
# * The name of the author may not be used to endorse or promote products
|
||||
# derived from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
import secrets
|
||||
|
||||
from cryptography import __version__
|
||||
from cryptography.hazmat.primitives import padding
|
||||
from cryptography.hazmat.primitives.ciphers.algorithms import AES
|
||||
|
||||
try:
|
||||
# 43.0.0 - https://cryptography.io/en/latest/changelog/#v43-0-0
|
||||
from cryptography.hazmat.decrepit.ciphers.algorithms import ARC4
|
||||
except ImportError:
|
||||
from cryptography.hazmat.primitives.ciphers.algorithms import ARC4
|
||||
from cryptography.hazmat.primitives.ciphers.base import Cipher
|
||||
from cryptography.hazmat.primitives.ciphers.modes import CBC, ECB
|
||||
|
||||
from pypdf._crypt_providers._base import CryptBase
|
||||
|
||||
crypt_provider = ("cryptography", __version__)
|
||||
|
||||
|
||||
class CryptRC4(CryptBase):
|
||||
def __init__(self, key: bytes) -> None:
|
||||
self.cipher = Cipher(ARC4(key), mode=None)
|
||||
|
||||
def encrypt(self, data: bytes) -> bytes:
|
||||
encryptor = self.cipher.encryptor()
|
||||
return encryptor.update(data) + encryptor.finalize()
|
||||
|
||||
def decrypt(self, data: bytes) -> bytes:
|
||||
decryptor = self.cipher.decryptor()
|
||||
return decryptor.update(data) + decryptor.finalize()
|
||||
|
||||
|
||||
class CryptAES(CryptBase):
|
||||
def __init__(self, key: bytes) -> None:
|
||||
self.alg = AES(key)
|
||||
|
||||
def encrypt(self, data: bytes) -> bytes:
|
||||
iv = secrets.token_bytes(16)
|
||||
pad = padding.PKCS7(128).padder()
|
||||
data = pad.update(data) + pad.finalize()
|
||||
|
||||
cipher = Cipher(self.alg, CBC(iv))
|
||||
encryptor = cipher.encryptor()
|
||||
return iv + encryptor.update(data) + encryptor.finalize()
|
||||
|
||||
def decrypt(self, data: bytes) -> bytes:
|
||||
iv = data[:16]
|
||||
data = data[16:]
|
||||
# for empty encrypted data
|
||||
if not data:
|
||||
return data
|
||||
|
||||
# just for robustness, it does not happen under normal circumstances
|
||||
if len(data) % 16 != 0:
|
||||
pad = padding.PKCS7(128).padder()
|
||||
data = pad.update(data) + pad.finalize()
|
||||
|
||||
cipher = Cipher(self.alg, CBC(iv))
|
||||
decryptor = cipher.decryptor()
|
||||
d = decryptor.update(data) + decryptor.finalize()
|
||||
return d[: -d[-1]]
|
||||
|
||||
|
||||
def rc4_encrypt(key: bytes, data: bytes) -> bytes:
|
||||
encryptor = Cipher(ARC4(key), mode=None).encryptor()
|
||||
return encryptor.update(data) + encryptor.finalize()
|
||||
|
||||
|
||||
def rc4_decrypt(key: bytes, data: bytes) -> bytes:
|
||||
decryptor = Cipher(ARC4(key), mode=None).decryptor()
|
||||
return decryptor.update(data) + decryptor.finalize()
|
||||
|
||||
|
||||
def aes_ecb_encrypt(key: bytes, data: bytes) -> bytes:
|
||||
encryptor = Cipher(AES(key), mode=ECB()).encryptor()
|
||||
return encryptor.update(data) + encryptor.finalize()
|
||||
|
||||
|
||||
def aes_ecb_decrypt(key: bytes, data: bytes) -> bytes:
|
||||
decryptor = Cipher(AES(key), mode=ECB()).decryptor()
|
||||
return decryptor.update(data) + decryptor.finalize()
|
||||
|
||||
|
||||
def aes_cbc_encrypt(key: bytes, iv: bytes, data: bytes) -> bytes:
|
||||
encryptor = Cipher(AES(key), mode=CBC(iv)).encryptor()
|
||||
return encryptor.update(data) + encryptor.finalize()
|
||||
|
||||
|
||||
def aes_cbc_decrypt(key: bytes, iv: bytes, data: bytes) -> bytes:
|
||||
decryptor = Cipher(AES(key), mode=CBC(iv)).decryptor()
|
||||
return decryptor.update(data) + decryptor.finalize()
|
||||
@ -0,0 +1,93 @@
|
||||
# Copyright (c) 2023, exiledkingcc
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
# met:
|
||||
#
|
||||
# * Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
# * The name of the author may not be used to endorse or promote products
|
||||
# derived from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
from pypdf._crypt_providers._base import CryptBase
|
||||
from pypdf.errors import DependencyError
|
||||
|
||||
_DEPENDENCY_ERROR_STR = "cryptography>=3.1 is required for AES algorithm"
|
||||
|
||||
|
||||
crypt_provider = ("local_crypt_fallback", "0.0.0")
|
||||
|
||||
|
||||
class CryptRC4(CryptBase):
|
||||
def __init__(self, key: bytes) -> None:
|
||||
self.s = bytearray(range(256))
|
||||
j = 0
|
||||
for i in range(256):
|
||||
j = (j + self.s[i] + key[i % len(key)]) % 256
|
||||
self.s[i], self.s[j] = self.s[j], self.s[i]
|
||||
|
||||
def encrypt(self, data: bytes) -> bytes:
|
||||
s = bytearray(self.s)
|
||||
out = [0 for _ in range(len(data))]
|
||||
i, j = 0, 0
|
||||
for k in range(len(data)):
|
||||
i = (i + 1) % 256
|
||||
j = (j + s[i]) % 256
|
||||
s[i], s[j] = s[j], s[i]
|
||||
x = s[(s[i] + s[j]) % 256]
|
||||
out[k] = data[k] ^ x
|
||||
return bytes(out)
|
||||
|
||||
def decrypt(self, data: bytes) -> bytes:
|
||||
return self.encrypt(data)
|
||||
|
||||
|
||||
class CryptAES(CryptBase):
|
||||
def __init__(self, key: bytes) -> None:
|
||||
pass
|
||||
|
||||
def encrypt(self, data: bytes) -> bytes:
|
||||
raise DependencyError(_DEPENDENCY_ERROR_STR)
|
||||
|
||||
def decrypt(self, data: bytes) -> bytes:
|
||||
raise DependencyError(_DEPENDENCY_ERROR_STR)
|
||||
|
||||
|
||||
def rc4_encrypt(key: bytes, data: bytes) -> bytes:
|
||||
return CryptRC4(key).encrypt(data)
|
||||
|
||||
|
||||
def rc4_decrypt(key: bytes, data: bytes) -> bytes:
|
||||
return CryptRC4(key).decrypt(data)
|
||||
|
||||
|
||||
def aes_ecb_encrypt(key: bytes, data: bytes) -> bytes:
|
||||
raise DependencyError(_DEPENDENCY_ERROR_STR)
|
||||
|
||||
|
||||
def aes_ecb_decrypt(key: bytes, data: bytes) -> bytes:
|
||||
raise DependencyError(_DEPENDENCY_ERROR_STR)
|
||||
|
||||
|
||||
def aes_cbc_encrypt(key: bytes, iv: bytes, data: bytes) -> bytes:
|
||||
raise DependencyError(_DEPENDENCY_ERROR_STR)
|
||||
|
||||
|
||||
def aes_cbc_decrypt(key: bytes, iv: bytes, data: bytes) -> bytes:
|
||||
raise DependencyError(_DEPENDENCY_ERROR_STR)
|
||||
@ -0,0 +1,97 @@
|
||||
# Copyright (c) 2023, exiledkingcc
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
# met:
|
||||
#
|
||||
# * Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
# * The name of the author may not be used to endorse or promote products
|
||||
# derived from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
import secrets
|
||||
|
||||
from Crypto import __version__
|
||||
from Crypto.Cipher import AES, ARC4
|
||||
from Crypto.Util.Padding import pad
|
||||
|
||||
from pypdf._crypt_providers._base import CryptBase
|
||||
|
||||
crypt_provider = ("pycryptodome", __version__)
|
||||
|
||||
|
||||
class CryptRC4(CryptBase):
|
||||
def __init__(self, key: bytes) -> None:
|
||||
self.key = key
|
||||
|
||||
def encrypt(self, data: bytes) -> bytes:
|
||||
return ARC4.ARC4Cipher(self.key).encrypt(data)
|
||||
|
||||
def decrypt(self, data: bytes) -> bytes:
|
||||
return ARC4.ARC4Cipher(self.key).decrypt(data)
|
||||
|
||||
|
||||
class CryptAES(CryptBase):
|
||||
def __init__(self, key: bytes) -> None:
|
||||
self.key = key
|
||||
|
||||
def encrypt(self, data: bytes) -> bytes:
|
||||
iv = secrets.token_bytes(16)
|
||||
data = pad(data, 16)
|
||||
aes = AES.new(self.key, AES.MODE_CBC, iv)
|
||||
return iv + aes.encrypt(data)
|
||||
|
||||
def decrypt(self, data: bytes) -> bytes:
|
||||
iv = data[:16]
|
||||
data = data[16:]
|
||||
# for empty encrypted data
|
||||
if not data:
|
||||
return data
|
||||
|
||||
# just for robustness, it does not happen under normal circumstances
|
||||
if len(data) % 16 != 0:
|
||||
data = pad(data, 16)
|
||||
|
||||
aes = AES.new(self.key, AES.MODE_CBC, iv)
|
||||
d = aes.decrypt(data)
|
||||
return d[: -d[-1]]
|
||||
|
||||
|
||||
def rc4_encrypt(key: bytes, data: bytes) -> bytes:
|
||||
return ARC4.ARC4Cipher(key).encrypt(data)
|
||||
|
||||
|
||||
def rc4_decrypt(key: bytes, data: bytes) -> bytes:
|
||||
return ARC4.ARC4Cipher(key).decrypt(data)
|
||||
|
||||
|
||||
def aes_ecb_encrypt(key: bytes, data: bytes) -> bytes:
|
||||
return AES.new(key, AES.MODE_ECB).encrypt(data)
|
||||
|
||||
|
||||
def aes_ecb_decrypt(key: bytes, data: bytes) -> bytes:
|
||||
return AES.new(key, AES.MODE_ECB).decrypt(data)
|
||||
|
||||
|
||||
def aes_cbc_encrypt(key: bytes, iv: bytes, data: bytes) -> bytes:
|
||||
return AES.new(key, AES.MODE_CBC, iv).encrypt(data)
|
||||
|
||||
|
||||
def aes_cbc_decrypt(key: bytes, iv: bytes, data: bytes) -> bytes:
|
||||
return AES.new(key, AES.MODE_CBC, iv).decrypt(data)
|
||||
1446
venv/lib/python3.12/site-packages/pypdf/_doc_common.py
Normal file
1446
venv/lib/python3.12/site-packages/pypdf/_doc_common.py
Normal file
File diff suppressed because it is too large
Load Diff
1178
venv/lib/python3.12/site-packages/pypdf/_encryption.py
Normal file
1178
venv/lib/python3.12/site-packages/pypdf/_encryption.py
Normal file
File diff suppressed because it is too large
Load Diff
42
venv/lib/python3.12/site-packages/pypdf/_merger.py
Normal file
42
venv/lib/python3.12/site-packages/pypdf/_merger.py
Normal file
@ -0,0 +1,42 @@
|
||||
# Copyright (c) 2006, Mathieu Fenniak
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
# met:
|
||||
#
|
||||
# * Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
# * The name of the author may not be used to endorse or promote products
|
||||
# derived from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
from ._utils import (
|
||||
deprecation_with_replacement,
|
||||
)
|
||||
|
||||
|
||||
class PdfMerger:
|
||||
"""
|
||||
Use :class:`PdfWriter` instead.
|
||||
|
||||
.. deprecated:: 5.0.0
|
||||
"""
|
||||
|
||||
def __init__(self) -> None:
|
||||
deprecation_with_replacement("PdfMerger", "PdfWriter", "5.0.0")
|
||||
2686
venv/lib/python3.12/site-packages/pypdf/_page.py
Normal file
2686
venv/lib/python3.12/site-packages/pypdf/_page.py
Normal file
File diff suppressed because it is too large
Load Diff
289
venv/lib/python3.12/site-packages/pypdf/_page_labels.py
Normal file
289
venv/lib/python3.12/site-packages/pypdf/_page_labels.py
Normal file
@ -0,0 +1,289 @@
|
||||
"""
|
||||
Page labels are shown by PDF viewers as "the page number".
|
||||
|
||||
A page has a numeric index, starting at 0. Additionally, the page
|
||||
has a label. In the most simple case:
|
||||
|
||||
label = index + 1
|
||||
|
||||
However, the title page and the table of contents might have Roman numerals as
|
||||
page labels. This makes things more complicated.
|
||||
|
||||
Example 1
|
||||
---------
|
||||
|
||||
>>> reader.root_object["/PageLabels"]["/Nums"]
|
||||
[0, IndirectObject(18, 0, 139929798197504),
|
||||
8, IndirectObject(19, 0, 139929798197504)]
|
||||
>>> reader.get_object(reader.root_object["/PageLabels"]["/Nums"][1])
|
||||
{'/S': '/r'}
|
||||
>>> reader.get_object(reader.root_object["/PageLabels"]["/Nums"][3])
|
||||
{'/S': '/D'}
|
||||
|
||||
Example 2
|
||||
---------
|
||||
The following is a document with pages labeled
|
||||
i, ii, iii, iv, 1, 2, 3, A-8, A-9, ...
|
||||
|
||||
1 0 obj
|
||||
<< /Type /Catalog
|
||||
/PageLabels << /Nums [
|
||||
0 << /S /r >>
|
||||
4 << /S /D >>
|
||||
7 << /S /D
|
||||
/P ( A- )
|
||||
/St 8
|
||||
>>
|
||||
% A number tree containing
|
||||
% three page label dictionaries
|
||||
]
|
||||
>>
|
||||
...
|
||||
>>
|
||||
endobj
|
||||
|
||||
|
||||
§12.4.2 PDF Specification 1.7 and 2.0
|
||||
=====================================
|
||||
|
||||
Entries in a page label dictionary
|
||||
----------------------------------
|
||||
The /S key:
|
||||
D Decimal Arabic numerals
|
||||
R Uppercase Roman numerals
|
||||
r Lowercase Roman numerals
|
||||
A Uppercase letters (A to Z for the first 26 pages,
|
||||
AA to ZZ for the next 26, and so on)
|
||||
a Lowercase letters (a to z for the first 26 pages,
|
||||
aa to zz for the next 26, and so on)
|
||||
"""
|
||||
|
||||
from typing import Iterator, List, Optional, Tuple, cast
|
||||
|
||||
from ._protocols import PdfCommonDocProtocol
|
||||
from ._utils import logger_warning
|
||||
from .generic import (
|
||||
ArrayObject,
|
||||
DictionaryObject,
|
||||
NullObject,
|
||||
NumberObject,
|
||||
is_null_or_none,
|
||||
)
|
||||
|
||||
|
||||
def number2uppercase_roman_numeral(num: int) -> str:
|
||||
roman = [
|
||||
(1000, "M"),
|
||||
(900, "CM"),
|
||||
(500, "D"),
|
||||
(400, "CD"),
|
||||
(100, "C"),
|
||||
(90, "XC"),
|
||||
(50, "L"),
|
||||
(40, "XL"),
|
||||
(10, "X"),
|
||||
(9, "IX"),
|
||||
(5, "V"),
|
||||
(4, "IV"),
|
||||
(1, "I"),
|
||||
]
|
||||
|
||||
def roman_num(num: int) -> Iterator[str]:
|
||||
for decimal, roman_repr in roman:
|
||||
x, _ = divmod(num, decimal)
|
||||
yield roman_repr * x
|
||||
num -= decimal * x
|
||||
if num <= 0:
|
||||
break
|
||||
|
||||
return "".join(list(roman_num(num)))
|
||||
|
||||
|
||||
def number2lowercase_roman_numeral(number: int) -> str:
|
||||
return number2uppercase_roman_numeral(number).lower()
|
||||
|
||||
|
||||
def number2uppercase_letter(number: int) -> str:
|
||||
if number <= 0:
|
||||
raise ValueError("Expecting a positive number")
|
||||
alphabet = [chr(i) for i in range(ord("A"), ord("Z") + 1)]
|
||||
rep = ""
|
||||
while number > 0:
|
||||
remainder = number % 26
|
||||
if remainder == 0:
|
||||
remainder = 26
|
||||
rep = alphabet[remainder - 1] + rep
|
||||
# update
|
||||
number -= remainder
|
||||
number = number // 26
|
||||
return rep
|
||||
|
||||
|
||||
def number2lowercase_letter(number: int) -> str:
|
||||
return number2uppercase_letter(number).lower()
|
||||
|
||||
|
||||
def get_label_from_nums(dictionary_object: DictionaryObject, index: int) -> str:
|
||||
# [Nums] shall be an array of the form
|
||||
# [ key_1 value_1 key_2 value_2 ... key_n value_n ]
|
||||
# where each key_i is an integer and the corresponding
|
||||
# value_i shall be the object associated with that key.
|
||||
# The keys shall be sorted in numerical order,
|
||||
# analogously to the arrangement of keys in a name tree
|
||||
# as described in 7.9.6, "Name Trees."
|
||||
nums = cast(ArrayObject, dictionary_object["/Nums"])
|
||||
i = 0
|
||||
value = None
|
||||
start_index = 0
|
||||
while i < len(nums):
|
||||
start_index = nums[i]
|
||||
value = nums[i + 1].get_object()
|
||||
if i + 2 == len(nums):
|
||||
break
|
||||
if nums[i + 2] > index:
|
||||
break
|
||||
i += 2
|
||||
m = {
|
||||
None: lambda n: "",
|
||||
"/D": lambda n: str(n),
|
||||
"/R": number2uppercase_roman_numeral,
|
||||
"/r": number2lowercase_roman_numeral,
|
||||
"/A": number2uppercase_letter,
|
||||
"/a": number2lowercase_letter,
|
||||
}
|
||||
# if /Nums array is not following the specification or if /Nums is empty
|
||||
if not isinstance(value, dict):
|
||||
return str(index + 1) # Fallback
|
||||
start = value.get("/St", 1)
|
||||
prefix = value.get("/P", "")
|
||||
return prefix + m[value.get("/S")](index - start_index + start)
|
||||
|
||||
|
||||
def index2label(reader: PdfCommonDocProtocol, index: int) -> str:
|
||||
"""
|
||||
See 7.9.7 "Number Trees".
|
||||
|
||||
Args:
|
||||
reader: The PdfReader
|
||||
index: The index of the page
|
||||
|
||||
Returns:
|
||||
The label of the page, e.g. "iv" or "4".
|
||||
|
||||
"""
|
||||
root = cast(DictionaryObject, reader.root_object)
|
||||
if "/PageLabels" not in root:
|
||||
return str(index + 1) # Fallback
|
||||
number_tree = cast(DictionaryObject, root["/PageLabels"].get_object())
|
||||
if "/Nums" in number_tree:
|
||||
return get_label_from_nums(number_tree, index)
|
||||
if "/Kids" in number_tree and not isinstance(number_tree["/Kids"], NullObject):
|
||||
# number_tree = {'/Kids': [IndirectObject(7333, 0, 140132998195856), ...]}
|
||||
# Limit maximum depth.
|
||||
level = 0
|
||||
while level < 100:
|
||||
kids = cast(List[DictionaryObject], number_tree["/Kids"])
|
||||
for kid in kids:
|
||||
# kid = {'/Limits': [0, 63], '/Nums': [0, {'/P': 'C1'}, ...]}
|
||||
limits = cast(List[int], kid["/Limits"])
|
||||
if limits[0] <= index <= limits[1]:
|
||||
if not is_null_or_none(kid.get("/Kids", None)):
|
||||
# Recursive definition.
|
||||
level += 1
|
||||
if level == 100: # pragma: no cover
|
||||
raise NotImplementedError(
|
||||
"Too deep nesting is not supported."
|
||||
)
|
||||
number_tree = kid
|
||||
# Exit the inner `for` loop and continue at the next level with the
|
||||
# next iteration of the `while` loop.
|
||||
break
|
||||
return get_label_from_nums(kid, index)
|
||||
else:
|
||||
# When there are no kids, make sure to exit the `while` loop directly
|
||||
# and continue with the fallback.
|
||||
break
|
||||
|
||||
logger_warning(f"Could not reliably determine page label for {index}.", __name__)
|
||||
return str(index + 1) # Fallback if neither /Nums nor /Kids is in the number_tree
|
||||
|
||||
|
||||
def nums_insert(
|
||||
key: NumberObject,
|
||||
value: DictionaryObject,
|
||||
nums: ArrayObject,
|
||||
) -> None:
|
||||
"""
|
||||
Insert a key, value pair in a Nums array.
|
||||
|
||||
See 7.9.7 "Number Trees".
|
||||
|
||||
Args:
|
||||
key: number key of the entry
|
||||
value: value of the entry
|
||||
nums: Nums array to modify
|
||||
|
||||
"""
|
||||
if len(nums) % 2 != 0:
|
||||
raise ValueError("A nums like array must have an even number of elements")
|
||||
|
||||
i = len(nums)
|
||||
while i != 0 and key <= nums[i - 2]:
|
||||
i = i - 2
|
||||
|
||||
if i < len(nums) and key == nums[i]:
|
||||
nums[i + 1] = value
|
||||
else:
|
||||
nums.insert(i, key)
|
||||
nums.insert(i + 1, value)
|
||||
|
||||
|
||||
def nums_clear_range(
|
||||
key: NumberObject,
|
||||
page_index_to: int,
|
||||
nums: ArrayObject,
|
||||
) -> None:
|
||||
"""
|
||||
Remove all entries in a number tree in a range after an entry.
|
||||
|
||||
See 7.9.7 "Number Trees".
|
||||
|
||||
Args:
|
||||
key: number key of the entry before the range
|
||||
page_index_to: The page index of the upper limit of the range
|
||||
nums: Nums array to modify
|
||||
|
||||
"""
|
||||
if len(nums) % 2 != 0:
|
||||
raise ValueError("A nums like array must have an even number of elements")
|
||||
if page_index_to < key:
|
||||
raise ValueError("page_index_to must be greater or equal than key")
|
||||
|
||||
i = nums.index(key) + 2
|
||||
while i < len(nums) and nums[i] <= page_index_to:
|
||||
nums.pop(i)
|
||||
nums.pop(i)
|
||||
|
||||
|
||||
def nums_next(
|
||||
key: NumberObject,
|
||||
nums: ArrayObject,
|
||||
) -> Tuple[Optional[NumberObject], Optional[DictionaryObject]]:
|
||||
"""
|
||||
Return the (key, value) pair of the entry after the given one.
|
||||
|
||||
See 7.9.7 "Number Trees".
|
||||
|
||||
Args:
|
||||
key: number key of the entry
|
||||
nums: Nums array
|
||||
|
||||
"""
|
||||
if len(nums) % 2 != 0:
|
||||
raise ValueError("A nums like array must have an even number of elements")
|
||||
|
||||
i = nums.index(key) + 2
|
||||
if i < len(nums):
|
||||
return (nums[i], nums[i + 1])
|
||||
else:
|
||||
return (None, None)
|
||||
86
venv/lib/python3.12/site-packages/pypdf/_protocols.py
Normal file
86
venv/lib/python3.12/site-packages/pypdf/_protocols.py
Normal file
@ -0,0 +1,86 @@
|
||||
"""Helpers for working with PDF types."""
|
||||
|
||||
from abc import abstractmethod
|
||||
from pathlib import Path
|
||||
from typing import IO, Any, Dict, List, Optional, Protocol, Tuple, Union
|
||||
|
||||
from ._utils import StrByteType, StreamType
|
||||
|
||||
|
||||
class PdfObjectProtocol(Protocol):
|
||||
indirect_reference: Any
|
||||
|
||||
def clone(
|
||||
self,
|
||||
pdf_dest: Any,
|
||||
force_duplicate: bool = False,
|
||||
ignore_fields: Union[Tuple[str, ...], List[str], None] = (),
|
||||
) -> Any:
|
||||
... # pragma: no cover
|
||||
|
||||
def _reference_clone(self, clone: Any, pdf_dest: Any) -> Any:
|
||||
... # pragma: no cover
|
||||
|
||||
def get_object(self) -> Optional["PdfObjectProtocol"]:
|
||||
... # pragma: no cover
|
||||
|
||||
def hash_value(self) -> bytes:
|
||||
... # pragma: no cover
|
||||
|
||||
def write_to_stream(
|
||||
self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
|
||||
) -> None:
|
||||
... # pragma: no cover
|
||||
|
||||
|
||||
class XmpInformationProtocol(PdfObjectProtocol):
|
||||
pass
|
||||
|
||||
|
||||
class PdfCommonDocProtocol(Protocol):
|
||||
@property
|
||||
def pdf_header(self) -> str:
|
||||
... # pragma: no cover
|
||||
|
||||
@property
|
||||
def pages(self) -> List[Any]:
|
||||
... # pragma: no cover
|
||||
|
||||
@property
|
||||
def root_object(self) -> PdfObjectProtocol:
|
||||
... # pragma: no cover
|
||||
|
||||
def get_object(self, indirect_reference: Any) -> Optional[PdfObjectProtocol]:
|
||||
... # pragma: no cover
|
||||
|
||||
@property
|
||||
def strict(self) -> bool:
|
||||
... # pragma: no cover
|
||||
|
||||
|
||||
class PdfReaderProtocol(PdfCommonDocProtocol, Protocol):
|
||||
@property
|
||||
@abstractmethod
|
||||
def xref(self) -> Dict[int, Dict[int, Any]]:
|
||||
... # pragma: no cover
|
||||
|
||||
@property
|
||||
@abstractmethod
|
||||
def trailer(self) -> Dict[str, Any]:
|
||||
... # pragma: no cover
|
||||
|
||||
|
||||
class PdfWriterProtocol(PdfCommonDocProtocol, Protocol):
|
||||
_objects: List[Any]
|
||||
_id_translated: Dict[int, Dict[int, int]]
|
||||
|
||||
incremental: bool
|
||||
_reader: Any # PdfReader
|
||||
|
||||
@abstractmethod
|
||||
def write(self, stream: Union[Path, StrByteType]) -> Tuple[bool, IO[Any]]:
|
||||
... # pragma: no cover
|
||||
|
||||
@abstractmethod
|
||||
def _add_object(self, obj: Any) -> Any:
|
||||
... # pragma: no cover
|
||||
1273
venv/lib/python3.12/site-packages/pypdf/_reader.py
Normal file
1273
venv/lib/python3.12/site-packages/pypdf/_reader.py
Normal file
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,248 @@
|
||||
"""
|
||||
Code related to text extraction.
|
||||
|
||||
Some parts are still in _page.py. In doubt, they will stay there.
|
||||
"""
|
||||
|
||||
import math
|
||||
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
|
||||
|
||||
from ..generic import DictionaryObject, TextStringObject, encode_pdfdocencoding
|
||||
|
||||
CUSTOM_RTL_MIN: int = -1
|
||||
CUSTOM_RTL_MAX: int = -1
|
||||
CUSTOM_RTL_SPECIAL_CHARS: List[int] = []
|
||||
LAYOUT_NEW_BT_GROUP_SPACE_WIDTHS: int = 5
|
||||
|
||||
|
||||
class OrientationNotFoundError(Exception):
|
||||
pass
|
||||
|
||||
|
||||
def set_custom_rtl(
    _min: Union[str, int, None] = None,
    _max: Union[str, int, None] = None,
    specials: Union[str, List[int], None] = None,
) -> Tuple[int, int, List[int]]:
    """
    Update the module-level right-to-left customization parameters.

    Args:
        _min: Lower bound of an extra character range to render right-to-left,
            given as an int code point or a one-character string (its ordinal
            is used). ``None`` leaves the current value untouched. The module
            default of -1 selects no additional range.
        _max: Upper bound of that range, with the same conventions as ``_min``.
        specials: Special characters to keep in the current insertion order,
            given either as a string (each character converted to its code
            point) or as a list of ints. ``None`` leaves the current list
            untouched. The module default is an empty list.

    Returns:
        The resulting ``(CUSTOM_RTL_MIN, CUSTOM_RTL_MAX,
        CUSTOM_RTL_SPECIAL_CHARS)`` triple.

    """
    global CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS
    if isinstance(_min, str):
        CUSTOM_RTL_MIN = ord(_min)
    elif isinstance(_min, int):
        CUSTOM_RTL_MIN = _min
    if isinstance(_max, str):
        CUSTOM_RTL_MAX = ord(_max)
    elif isinstance(_max, int):
        CUSTOM_RTL_MAX = _max
    if isinstance(specials, list):
        CUSTOM_RTL_SPECIAL_CHARS = specials
    elif isinstance(specials, str):
        CUSTOM_RTL_SPECIAL_CHARS = [ord(ch) for ch in specials]
    return CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS
|
||||
|
||||
|
||||
def mult(m: List[float], n: List[float]) -> List[float]:
    """
    Compose two PDF transformation matrices.

    Both matrices use the 6-element ``[a, b, c, d, e, f]`` PDF convention,
    where indices 4 and 5 hold the translation components. Returns ``m @ n``.
    """
    a0, a1, b0, b1 = n[0], n[1], n[2], n[3]
    return [
        m[0] * a0 + m[1] * b0,
        m[0] * a1 + m[1] * b1,
        m[2] * a0 + m[3] * b0,
        m[2] * a1 + m[3] * b1,
        m[4] * a0 + m[5] * b0 + n[4],
        m[4] * a1 + m[5] * b1 + n[5],
    ]
|
||||
|
||||
|
||||
def orient(m: List[float]) -> int:
    """
    Classify the text orientation of transformation matrix *m*.

    Returns 0, 90, 180, or 270 (degrees). The d component (``m[3]``) decides
    upright vs. upside-down within a small tolerance; otherwise the b
    component (``m[1]``) distinguishes the two vertical orientations.
    """
    d_component = m[3]
    if d_component > 1e-6:
        return 0
    if d_component < -1e-6:
        return 180
    return 90 if m[1] > 0 else 270
|
||||
|
||||
|
||||
def crlf_space_check(
    text: str,
    cmtm_prev: Tuple[List[float], List[float]],
    cmtm_matrix: Tuple[List[float], List[float]],
    memo_cmtm: Tuple[List[float], List[float]],
    cmap: Tuple[
        Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
    ],
    orientations: Tuple[int, ...],
    output: str,
    font_size: float,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
    str_widths: float,
    spacewidth: float,
    str_height: float,
) -> Tuple[str, str, List[float], List[float]]:
    """
    Decide whether a newline or a space separates the current text run from
    the previous one, based on how far the rendering position moved.

    Args:
        text: text accumulated for the current line so far.
        cmtm_prev: (cm, tm) matrices from the previous text operation.
        cmtm_matrix: (cm, tm) matrices for the current text operation.
        memo_cmtm: (cm, tm) matrices memorized for visitor callbacks.
        cmap: character-map tuple; only ``cmap[3]`` (font dictionary) is used.
        orientations: orientations (degrees) the caller wants extracted.
        output: page text emitted so far.
        font_size: current nominal font size.
        visitor_text: optional callback invoked when a line is flushed.
        str_widths: width of the previous rendered string.
        spacewidth: width of a space in the current font.
        str_height: height of the previous rendered string.

    Returns:
        Updated ``(text, output, cm_prev, tm_prev)``.

    Raises:
        OrientationNotFoundError: when the computed orientation is not in
            *orientations* (caller skips this text).
    """
    cm_prev = cmtm_prev[0]
    tm_prev = cmtm_prev[1]
    cm_matrix = cmtm_matrix[0]
    tm_matrix = cmtm_matrix[1]
    memo_cm = memo_cmtm[0]
    memo_tm = memo_cmtm[1]

    # Displacement of the rendering origin between the two operations.
    m_prev = mult(tm_prev, cm_prev)
    m = mult(tm_matrix, cm_matrix)
    orientation = orient(m)
    delta_x = m[4] - m_prev[4]
    delta_y = m[5] - m_prev[5]
    # Table 108 of the 1.7 reference ("Text positioning operators")
    scale_prev_x = math.sqrt(tm_prev[0]**2 + tm_prev[1]**2)
    scale_prev_y = math.sqrt(tm_prev[2]**2 + tm_prev[3]**2)
    scale_y = math.sqrt(tm_matrix[2]**2 + tm_matrix[3]**2)
    # NOTE(review): this assignment is overwritten below before use;
    # presumably retained from an earlier revision.
    cm_prev = m

    if orientation not in orientations:
        raise OrientationNotFoundError
    # Map the x/y displacement onto "along the line" vs "across lines"
    # according to page orientation.
    if orientation in (0, 180):
        moved_height: float = delta_y
        moved_width: float = delta_x
    elif orientation in (90, 270):
        moved_height = delta_x
        moved_width = delta_y
    try:
        # Vertical move larger than ~80% of a line height -> new line.
        if abs(moved_height) > 0.8 * min(str_height * scale_prev_y, font_size * scale_y):
            if (output + text)[-1] != "\n":
                output += text + "\n"
                if visitor_text is not None:
                    visitor_text(
                        text + "\n",
                        memo_cm,
                        memo_tm,
                        cmap[3],
                        font_size,
                    )
            text = ""
        # Horizontal gap at least one space beyond the previous string -> space.
        elif (
            (moved_width >= (spacewidth + str_widths) * scale_prev_x)
            and (output + text)[-1] != " "
        ):
            text += " "
    except Exception:
        # best-effort: e.g. indexing an empty (output + text); keep going
        pass
    tm_prev = tm_matrix.copy()
    cm_prev = cm_matrix.copy()
    return text, output, cm_prev, tm_prev
|
||||
|
||||
|
||||
def get_text_operands(
    operands: List[Union[str, TextStringObject]],
    cm_matrix: List[float],
    tm_matrix: List[float],
    cmap: Tuple[
        Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
    ],
    orientations: Tuple[int, ...]
) -> Tuple[str, bool]:
    """
    Decode the operand of a text-showing operator into a Python string.

    Args:
        operands: operator operands; only ``operands[0]`` is decoded.
        cm_matrix: current transformation matrix.
        tm_matrix: current text matrix.
        cmap: tuple whose first element is either an encoding name (str) or a
            byte-value -> character dict used to decode raw bytes.
        orientations: orientations (degrees) the caller wants extracted;
            text in other orientations yields an empty result.

    Returns:
        ``(text, is_str_operands)`` where *is_str_operands* is True when the
        operand was already a decoded ``str`` and needed no byte decoding.
    """
    t: str = ""
    is_str_operands = False
    m = mult(tm_matrix, cm_matrix)
    orientation = orient(m)
    if orientation in orientations and len(operands) > 0:
        if isinstance(operands[0], str):
            # already text; pass through untouched
            t = operands[0]
            is_str_operands = True
        else:
            t = ""
            tt: bytes = (
                encode_pdfdocencoding(operands[0])
                if isinstance(operands[0], str)
                else operands[0]
            )
            if isinstance(cmap[0], str):
                try:
                    t = tt.decode(cmap[0], "surrogatepass")  # apply str encoding
                except Exception:
                    # the data does not match the expectation,
                    # we use the alternative ;
                    # text extraction may not be good
                    t = tt.decode(
                        "utf-16-be" if cmap[0] == "charmap" else "charmap",
                        "surrogatepass",
                    )  # apply str encoding
            else:  # apply dict encoding
                t = "".join(
                    [cmap[0][x] if x in cmap[0] else bytes((x,)).decode() for x in tt]
                )
    return (t, is_str_operands)
|
||||
|
||||
|
||||
def get_display_str(
    text: str,
    cm_matrix: List[float],
    tm_matrix: List[float],
    cmap: Tuple[
        Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
    ],
    text_operands: str,
    font_size: float,
    rtl_dir: bool,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]]
) -> Tuple[str, bool]:
    """
    Append *text_operands* to *text* character by character, honoring the
    current writing direction (left-to-right vs right-to-left).

    When the direction flips, the accumulated *text* is flushed to
    *visitor_text* (if provided) and restarted, so each callback receives a
    run of uniform direction.

    Args:
        text: text accumulated so far for the current run.
        cm_matrix: current transformation matrix (passed to the visitor).
        tm_matrix: current text matrix (passed to the visitor).
        cmap: character-map tuple; ``cmap[1]`` maps raw characters to their
            display form, ``cmap[3]`` is the font dictionary.
        text_operands: decoded characters to insert.
        font_size: current nominal font size (passed to the visitor).
        rtl_dir: True when currently inserting right-to-left.
        visitor_text: optional callback invoked on each direction change.

    Returns:
        Updated ``(text, rtl_dir)``.
    """
    # "\u0590 - \u08FF \uFB50 - \uFDFF"
    for x in [cmap[1].get(x, x) for x in text_operands]:
        # x can be a sequence of bytes ; ex: habibi.pdf
        if len(x) == 1:
            xx = ord(x)
        else:
            # multi-char mapping: treat as a direction-neutral code point
            xx = 1
        # fmt: off
        if (
            # cases where the current inserting order is kept
            (xx <= 0x2F)  # punctuations but...
            or 0x3A <= xx <= 0x40  # numbers (x30-39)
            or 0x2000 <= xx <= 0x206F  # upper punctuations..
            or 0x20A0 <= xx <= 0x21FF  # but (numbers) indices/exponents
            or xx in CUSTOM_RTL_SPECIAL_CHARS  # customized....
        ):
            text = x + text if rtl_dir else text + x
        elif (  # right-to-left characters set
            0x0590 <= xx <= 0x08FF
            or 0xFB1D <= xx <= 0xFDFF
            or 0xFE70 <= xx <= 0xFEFF
            or CUSTOM_RTL_MIN <= xx <= CUSTOM_RTL_MAX
        ):
            if not rtl_dir:
                # direction change LTR -> RTL: flush the LTR run
                rtl_dir = True
                if visitor_text is not None:
                    visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
                text = ""
            text = x + text
        else:  # left-to-right
            if rtl_dir:
                # direction change RTL -> LTR: flush the RTL run
                rtl_dir = False
                if visitor_text is not None:
                    visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
                text = ""
            text = text + x
        # fmt: on
    return text, rtl_dir
|
||||
Binary file not shown.
@ -0,0 +1,16 @@
|
||||
"""Layout mode text extraction extension for pypdf"""
|
||||
from ._fixed_width_page import (
|
||||
fixed_char_width,
|
||||
fixed_width_page,
|
||||
text_show_operations,
|
||||
y_coordinate_groups,
|
||||
)
|
||||
from ._font import Font
|
||||
|
||||
__all__ = [
|
||||
"Font",
|
||||
"fixed_char_width",
|
||||
"fixed_width_page",
|
||||
"text_show_operations",
|
||||
"y_coordinate_groups",
|
||||
]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,394 @@
|
||||
"""Extract PDF text preserving the layout of the source PDF"""
|
||||
|
||||
from itertools import groupby
|
||||
from math import ceil
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Iterator, List, Literal, Optional, Tuple, TypedDict
|
||||
|
||||
from ..._utils import logger_warning
|
||||
from .. import LAYOUT_NEW_BT_GROUP_SPACE_WIDTHS
|
||||
from ._font import Font
|
||||
from ._text_state_manager import TextStateManager
|
||||
from ._text_state_params import TextStateParams
|
||||
|
||||
|
||||
class BTGroup(TypedDict):
    """
    Dict describing a line of text rendered within a BT/ET operator pair.

    If multiple text show operations render text on the same line, the text
    will be combined into a single BTGroup dict.

    Keys:
        tx: x coordinate of first character in BTGroup
        ty: y coordinate of first character in BTGroup
        font_size: nominal font size
        font_height: effective font height
        text: rendered text
        displaced_tx: x coordinate of last character in BTGroup
        flip_sort: -1 if page is upside down, else 1
    """

    tx: float
    ty: float
    font_size: float
    font_height: float
    text: str
    displaced_tx: float
    flip_sort: Literal[-1, 1]
|
||||
|
||||
|
||||
def bt_group(tj_op: TextStateParams, rendered_text: str, dispaced_tx: float) -> BTGroup:
    """
    BTGroup constructed from a TextStateParams instance, rendered text, and
    displaced tx value.

    Args:
        tj_op (TextStateParams): TextStateParams instance
        rendered_text (str): rendered text
        dispaced_tx (float): x coordinate of last character in BTGroup.
            NOTE(review): parameter name is misspelled ("dispaced"); kept
            as-is for interface stability.

    Returns:
        BTGroup: snapshot of this text line's position, metrics, and content.
    """
    return BTGroup(
        tx=tj_op.tx,
        ty=tj_op.ty,
        font_size=tj_op.font_size,
        font_height=tj_op.font_height,
        text=rendered_text,
        displaced_tx=dispaced_tx,
        # negative flip_sort makes upside-down pages sort top-to-bottom
        flip_sort=-1 if tj_op.flip_vertical else 1,
    )
|
||||
|
||||
|
||||
def recurs_to_target_op(
    ops: Iterator[Tuple[List[Any], bytes]],
    text_state_mgr: TextStateManager,
    end_target: Literal[b"Q", b"ET"],
    fonts: Dict[str, Font],
    strip_rotated: bool = True,
) -> Tuple[List[BTGroup], List[TextStateParams]]:
    """
    Recurse operators between BT/ET and/or q/Q operators managing the transform
    stack and capturing text positioning and rendering data.

    Args:
        ops: iterator of operators in content stream
        text_state_mgr: a TextStateManager instance
        end_target: Either b"Q" (ends b"q" op) or b"ET" (ends b"BT" op)
        fonts: font dictionary as returned by PageObject._layout_mode_fonts()
        strip_rotated: drop text rotated w.r.t. the page. Defaults to True.

    Returns:
        tuple: list of BTGroup dicts + list of TextStateParams dataclass instances.

    """
    # 1 entry per line of text rendered within each BT/ET operation.
    bt_groups: List[BTGroup] = []

    # 1 entry per text show operator (Tj/TJ/'/")
    tj_ops: List[TextStateParams] = []

    if end_target == b"Q":
        # add new q level. cm's added at this level will be popped at next b'Q'
        text_state_mgr.add_q()

    while True:
        try:
            operands, op = next(ops)
        except StopIteration:
            # content stream exhausted before the closing operator
            return bt_groups, tj_ops
        if op == end_target:
            if op == b"Q":
                text_state_mgr.remove_q()
            if op == b"ET":
                if not tj_ops:
                    return bt_groups, tj_ops
                _text = ""
                bt_idx = 0  # idx of first tj in this bt group
                last_displaced_tx = tj_ops[bt_idx].displaced_tx
                last_ty = tj_ops[bt_idx].ty
                for _idx, _tj in enumerate(
                    tj_ops
                ):  # ... build text from new Tj operators
                    if strip_rotated and _tj.rotated:
                        continue
                    if not _tj.font.interpretable:  # generates warning
                        continue
                    # if the y position of the text is greater than the font height, assume
                    # the text is on a new line and start a new group
                    if abs(_tj.ty - last_ty) > _tj.font_height:
                        if _text.strip():
                            bt_groups.append(
                                bt_group(tj_ops[bt_idx], _text, last_displaced_tx)
                            )
                        bt_idx = _idx
                        _text = ""

                    # if the x position of the text is less than the last x position by
                    # more than 5 spaces widths, assume the text order should be flipped
                    # and start a new group
                    if (
                        last_displaced_tx - _tj.tx
                        > _tj.space_tx * LAYOUT_NEW_BT_GROUP_SPACE_WIDTHS
                    ):
                        if _text.strip():
                            bt_groups.append(
                                bt_group(tj_ops[bt_idx], _text, last_displaced_tx)
                            )
                        bt_idx = _idx
                        last_displaced_tx = _tj.displaced_tx
                        _text = ""

                    # calculate excess x translation based on ending tx of previous Tj.
                    # multiply by bool (_idx != bt_idx) to ensure spaces aren't double
                    # applied to the first tj of a BTGroup in fixed_width_page().
                    excess_tx = round(_tj.tx - last_displaced_tx, 3) * (_idx != bt_idx)
                    # space_tx could be 0 if either Tz or font_size was 0 for this _tj.
                    spaces = int(excess_tx // _tj.space_tx) if _tj.space_tx else 0
                    new_text = f'{" " * spaces}{_tj.txt}'

                    last_ty = _tj.ty
                    _text = f"{_text}{new_text}"
                    last_displaced_tx = _tj.displaced_tx
                if _text:
                    bt_groups.append(bt_group(tj_ops[bt_idx], _text, last_displaced_tx))
            text_state_mgr.reset_tm()
            return bt_groups, tj_ops
        if op == b"q":
            # nested graphics state: recurse until the matching Q
            bts, tjs = recurs_to_target_op(
                ops, text_state_mgr, b"Q", fonts, strip_rotated
            )
            bt_groups.extend(bts)
            tj_ops.extend(tjs)
        elif op == b"cm":
            text_state_mgr.add_cm(*operands)
        elif op == b"BT":
            # nested text object: recurse until the matching ET
            bts, tjs = recurs_to_target_op(
                ops, text_state_mgr, b"ET", fonts, strip_rotated
            )
            bt_groups.extend(bts)
            tj_ops.extend(tjs)
        elif op == b"Tj":
            tj_ops.append(text_state_mgr.text_state_params(operands[0]))
        elif op == b"TJ":
            # TJ interleaves byte strings with numeric inter-glyph offsets
            _tj = text_state_mgr.text_state_params()
            for tj_op in operands[0]:
                if isinstance(tj_op, bytes):
                    _tj = text_state_mgr.text_state_params(tj_op)
                    tj_ops.append(_tj)
                else:
                    text_state_mgr.add_trm(_tj.displacement_matrix(TD_offset=tj_op))
        elif op == b"'":
            # ' = move to next line, then show text
            text_state_mgr.reset_trm()
            text_state_mgr.add_tm([0, -text_state_mgr.TL])
            tj_ops.append(text_state_mgr.text_state_params(operands[0]))
        elif op == b'"':
            # " = set word/char spacing, move to next line, then show text
            text_state_mgr.reset_trm()
            text_state_mgr.set_state_param(b"Tw", operands[0])
            text_state_mgr.set_state_param(b"Tc", operands[1])
            text_state_mgr.add_tm([0, -text_state_mgr.TL])
            tj_ops.append(text_state_mgr.text_state_params(operands[2]))
        elif op in (b"Td", b"Tm", b"TD", b"T*"):
            # text positioning operators
            text_state_mgr.reset_trm()
            if op == b"Tm":
                text_state_mgr.reset_tm()
            elif op == b"TD":
                # TD also sets the leading to -ty
                text_state_mgr.set_state_param(b"TL", -operands[1])
            elif op == b"T*":
                operands = [0, -text_state_mgr.TL]
            text_state_mgr.add_tm(operands)
        elif op == b"Tf":
            text_state_mgr.set_font(fonts[operands[0]], operands[1])
        else:  # handle Tc, Tw, Tz, TL, and Ts operators
            text_state_mgr.set_state_param(op, operands)
|
||||
|
||||
|
||||
def y_coordinate_groups(
    bt_groups: List[BTGroup], debug_path: Optional[Path] = None
) -> Dict[int, List[BTGroup]]:
    """
    Group text operations by rendered y coordinate, i.e. the line number.

    Args:
        bt_groups: list of dicts as returned by text_show_operations().
            NOTE(review): itertools.groupby only merges *adjacent* items, so
            this is assumed to arrive already sorted by ty (as
            text_show_operations() produces); raises StopIteration if empty —
            confirm callers guard against an empty list.
        debug_path (Path, optional): Path to a directory for saving debug output.

    Returns:
        Dict[int, List[BTGroup]]: dict of lists of text rendered by each BT operator
            keyed by y coordinate

    """
    # flip_sort makes upside-down pages group in top-to-bottom order
    ty_groups = {
        ty: sorted(grp, key=lambda x: x["tx"])
        for ty, grp in groupby(
            bt_groups, key=lambda bt_grp: int(bt_grp["ty"] * bt_grp["flip_sort"])
        )
    }
    # combine groups whose y coordinates differ by less than the effective font height
    # (accounts for mixed fonts and other minor oddities)
    last_ty = next(iter(ty_groups))
    last_txs = {int(_t["tx"]) for _t in ty_groups[last_ty] if _t["text"].strip()}
    for ty in list(ty_groups)[1:]:
        fsz = min(ty_groups[_y][0]["font_height"] for _y in (ty, last_ty))
        txs = {int(_t["tx"]) for _t in ty_groups[ty] if _t["text"].strip()}
        # prevent merge if both groups are rendering in the same x position.
        no_text_overlap = not (txs & last_txs)
        offset_less_than_font_height = abs(ty - last_ty) < fsz
        if no_text_overlap and offset_less_than_font_height:
            # merge this group into the previous one, re-sorted by x
            ty_groups[last_ty] = sorted(
                ty_groups.pop(ty) + ty_groups[last_ty], key=lambda x: x["tx"]
            )
            last_txs |= txs
        else:
            last_ty = ty
            last_txs = txs
    if debug_path:  # pragma: no cover
        import json

        debug_path.joinpath("bt_groups.json").write_text(
            json.dumps(ty_groups, indent=2, default=str), "utf-8"
        )
    return ty_groups
|
||||
|
||||
|
||||
def text_show_operations(
    ops: Iterator[Tuple[List[Any], bytes]],
    fonts: Dict[str, Font],
    strip_rotated: bool = True,
    debug_path: Optional[Path] = None,
) -> List[BTGroup]:
    """
    Extract text from BT/ET operator pairs.

    Args:
        ops (Iterator[Tuple[List, bytes]]): iterator of operators in content stream
        fonts (Dict[str, Font]): font dictionary
        strip_rotated: Removes text if rotated w.r.t. to the page. Defaults to True.
        debug_path (Path, optional): Path to a directory for saving debug output.

    Returns:
        List[BTGroup]: list of dicts of text rendered by each BT operator,
            sorted top-to-bottom / left-to-right and left-aligned (all tx
            values shifted so the minimum becomes 0).

    """
    state_mgr = TextStateManager()  # transformation stack manager
    debug = bool(debug_path)
    bt_groups: List[BTGroup] = []  # BT operator dict
    tj_debug: List[TextStateParams] = []  # Tj/TJ operator data (debug only)
    try:
        # each warning is emitted at most once per page
        warned_rotation = False
        warned_uninterpretable_font = False
        while True:
            operands, op = next(ops)
            if op in (b"BT", b"q"):
                bts, tjs = recurs_to_target_op(
                    ops, state_mgr, b"ET" if op == b"BT" else b"Q", fonts, strip_rotated
                )
                if not warned_rotation and any(tj.rotated for tj in tjs):
                    warned_rotation = True
                    if strip_rotated:
                        logger_warning(
                            "Rotated text discovered. Output will be incomplete.",
                            __name__,
                        )
                    else:
                        logger_warning(
                            "Rotated text discovered. Layout will be degraded.",
                            __name__,
                        )
                if not warned_uninterpretable_font and any(not tj.font.interpretable for tj in tjs):
                    warned_uninterpretable_font = True
                    logger_warning(
                        "PDF contains an uninterpretable font. Output will be incomplete.",
                        __name__,
                    )
                bt_groups.extend(bts)
                if debug:  # pragma: no cover
                    tj_debug.extend(tjs)
            elif op == b"Tf":
                state_mgr.set_font(fonts[operands[0]], operands[1])
            else:  # set Tc, Tw, Tz, TL, and Ts if required. ignores all other ops
                state_mgr.set_state_param(op, operands)
    except StopIteration:
        pass

    # left align the data, i.e. decrement all tx values by min(tx)
    min_x = min((x["tx"] for x in bt_groups), default=0.0)
    bt_groups = [
        dict(ogrp, tx=ogrp["tx"] - min_x, displaced_tx=ogrp["displaced_tx"] - min_x)  # type: ignore[misc]
        for ogrp in sorted(
            bt_groups, key=lambda x: (x["ty"] * x["flip_sort"], -x["tx"]), reverse=True
        )
    ]

    if debug_path:  # pragma: no cover
        import json

        debug_path.joinpath("bts.json").write_text(
            json.dumps(bt_groups, indent=2, default=str), "utf-8"
        )
        debug_path.joinpath("tjs.json").write_text(
            json.dumps(
                tj_debug, indent=2, default=lambda x: getattr(x, "to_dict", str)(x)
            ),
            "utf-8",
        )
    return bt_groups
|
||||
|
||||
|
||||
def fixed_char_width(bt_groups: List[BTGroup], scale_weight: float = 1.25) -> float:
    """
    Calculate average character width weighted by the length of the rendered
    text in each sample for conversion to fixed-width layout.

    Args:
        bt_groups (List[BTGroup]): List of dicts of text rendered by each
            BT operator
        scale_weight: multiplier applied to each sample's character count
            when weighting. Defaults to 1.25.

    Returns:
        float: fixed character width. Falls back to 1.0 when no group carries
            usable width information (e.g. empty input), instead of raising
            ZeroDivisionError.

    """
    char_widths = []
    for _bt in bt_groups:
        _len = len(_bt["text"]) * scale_weight
        if _len:  # an empty text sample carries no width information; skip it
            char_widths.append(((_bt["displaced_tx"] - _bt["tx"]) / _len, _len))
    total_weight = sum(_l for _, _l in char_widths)
    if not total_weight:
        # No usable samples -> avoid ZeroDivisionError; a positive default
        # keeps downstream `tx // char_width` computations well-defined.
        return 1.0
    return sum(_w * _l for _w, _l in char_widths) / total_weight
|
||||
|
||||
|
||||
def fixed_width_page(
    ty_groups: Dict[int, List[BTGroup]], char_width: float, space_vertically: bool, font_height_weight: float
) -> str:
    """
    Generate page text from text operations grouped by rendered y coordinate.

    Args:
        ty_groups: dict of text show ops as returned by y_coordinate_groups()
        char_width: fixed character width
        space_vertically: include blank lines inferred from y distance + font height.
        font_height_weight: multiplier for font height when calculating blank lines.

    Returns:
        str: page text in a fixed width format that closely adheres to the rendered
            layout in the source pdf.

    """
    lines: List[str] = []
    last_y_coord = 0
    for y_coord, line_data in ty_groups.items():
        if space_vertically and lines:
            fh = line_data[0]["font_height"]
            # number of blank lines implied by the vertical gap; a negative
            # result extends lines by nothing ([""] * negative == [])
            blank_lines = 0 if fh == 0 else (
                int(abs(y_coord - last_y_coord) / (fh * font_height_weight)) - 1
            )
            lines.extend([""] * blank_lines)
        line = ""
        last_disp = 0.0
        for bt_op in line_data:
            # pad with spaces up to the op's column, but only when it starts
            # beyond the previous op's displaced end (avoids negative padding)
            offset = int(bt_op["tx"] // char_width)
            spaces = (offset - len(line)) * (ceil(last_disp) < int(bt_op["tx"]))
            line = f"{line}{' ' * spaces}{bt_op['text']}"
            last_disp = bt_op["displaced_tx"]
        if line.strip() or lines:
            # replace control characters 14-31 with spaces (codes < 14, e.g.
            # \t and \n, are preserved)
            lines.append(
                "".join(c if ord(c) < 14 or ord(c) > 31 else " " for c in line)
            )
        last_y_coord = y_coord
    return "\n".join(ln.rstrip() for ln in lines if space_vertically or ln.strip())
|
||||
@ -0,0 +1,152 @@
|
||||
"""Font constants and classes for "layout" mode text operations"""
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, Dict, Sequence, Union, cast
|
||||
|
||||
from ..._codecs import adobe_glyphs
|
||||
from ...errors import ParseError
|
||||
from ...generic import IndirectObject
|
||||
from ._font_widths import STANDARD_WIDTHS
|
||||
|
||||
|
||||
@dataclass
class Font:
    """
    A font object formatted for use during "layout" mode text extraction

    Attributes:
        subtype (str): font subtype
        space_width (int | float): width of a space character
        encoding (str | Dict[int, str]): font encoding
        char_map (dict): character map
        font_dictionary (dict): font dictionary
        width_map (Dict[str, int]): mapping of characters to widths
        interpretable (bool): Default True. If False, the font glyphs cannot
            be translated to characters, e.g. Type3 fonts that do not define
            a '/ToUnicode' mapping.

    """

    subtype: str
    space_width: Union[int, float]
    encoding: Union[str, Dict[int, str]]
    char_map: Dict[Any, Any]
    font_dictionary: Dict[Any, Any]
    width_map: Dict[str, int] = field(default_factory=dict, init=False)
    interpretable: bool = True

    def __post_init__(self) -> None:
        # Populate width_map from whichever width source the font provides:
        # /Widths (simple fonts), /W in /DescendantFonts (CID fonts), or the
        # standard-14 metrics table as a last resort.
        #
        # Type3 fonts that do not specify a "/ToUnicode" mapping cannot be
        # reliably converted into character codes unless all named chars
        # in /CharProcs map to a standard adobe glyph. See § 9.10.2 of the
        # PDF 1.7 standard.
        if self.subtype == "/Type3" and "/ToUnicode" not in self.font_dictionary:
            self.interpretable = all(
                cname in adobe_glyphs
                for cname in self.font_dictionary.get("/CharProcs") or []
            )

        if not self.interpretable:  # save some overhead if font is not interpretable
            return

        # TrueType fonts have a /Widths array mapping character codes to widths
        if isinstance(self.encoding, dict) and "/Widths" in self.font_dictionary:
            first_char = self.font_dictionary.get("/FirstChar", 0)
            self.width_map = {
                self.encoding.get(idx + first_char, chr(idx + first_char)): width
                for idx, width in enumerate(self.font_dictionary["/Widths"])
            }

        # CID fonts have a /W array mapping character codes to widths stashed in /DescendantFonts
        if "/DescendantFonts" in self.font_dictionary:
            d_font: Dict[Any, Any]
            for d_font_idx, d_font in enumerate(
                self.font_dictionary["/DescendantFonts"]
            ):
                # resolve (possibly chained) indirect references in place
                while isinstance(d_font, IndirectObject):
                    d_font = d_font.get_object()
                self.font_dictionary["/DescendantFonts"][d_font_idx] = d_font
                # map each mapped character's code point to its surrogate
                ord_map = {
                    ord(_target): _surrogate
                    for _target, _surrogate in self.char_map.items()
                    if isinstance(_target, str)
                }
                # /W width definitions have two valid formats which can be mixed and matched:
                # (1) A character start index followed by a list of widths, e.g.
                # `45 [500 600 700]` applies widths 500, 600, 700 to characters 45-47.
                # (2) A character start index, a character stop index, and a width, e.g.
                # `45 65 500` applies width 500 to characters 45-65.
                skip_count = 0
                _w = d_font.get("/W", [])
                for idx, w_entry in enumerate(_w):
                    w_entry = w_entry.get_object()
                    if skip_count:
                        # entry already consumed by a previous format match
                        skip_count -= 1
                        continue
                    if not isinstance(w_entry, (int, float)):  # pragma: no cover
                        # We should never get here due to skip_count above. Add a
                        # warning and or use reader's "strict" to force an ex???
                        continue
                    # check for format (1): `int [int int int int ...]`
                    w_next_entry = _w[idx + 1].get_object()
                    if isinstance(w_next_entry, Sequence):
                        start_idx, width_list = w_entry, w_next_entry
                        self.width_map.update(
                            {
                                ord_map[_cidx]: _width
                                for _cidx, _width in zip(
                                    range(
                                        cast(int, start_idx),
                                        cast(int, start_idx) + len(width_list),
                                        1,
                                    ),
                                    width_list,
                                )
                                if _cidx in ord_map
                            }
                        )
                        skip_count = 1
                    # check for format (2): `int int int`
                    elif isinstance(w_next_entry, (int, float)) and isinstance(
                        _w[idx + 2].get_object(), (int, float)
                    ):
                        start_idx, stop_idx, const_width = (
                            w_entry,
                            w_next_entry,
                            _w[idx + 2].get_object(),
                        )
                        self.width_map.update(
                            {
                                ord_map[_cidx]: const_width
                                for _cidx in range(
                                    cast(int, start_idx), cast(int, stop_idx + 1), 1
                                )
                                if _cidx in ord_map
                            }
                        )
                        skip_count = 2
                    else:
                        # Note: this doesn't handle the case of out of bounds (reaching the end of the width definitions
                        # while expecting more elements). This raises an IndexError which is sufficient.
                        raise ParseError(
                            f"Invalid font width definition. Next elements: {w_entry}, {w_next_entry}, {_w[idx + 2]}"
                        )  # pragma: no cover

        # fall back to the standard-14 font metrics by /BaseFont prefix
        if not self.width_map and "/BaseFont" in self.font_dictionary:
            for key in STANDARD_WIDTHS:
                if self.font_dictionary["/BaseFont"].startswith(f"/{key}"):
                    self.width_map = STANDARD_WIDTHS[key]
                    break

    def word_width(self, word: str) -> float:
        """Sum of character widths specified in PDF font for the supplied word"""
        # characters missing from width_map fall back to twice the space width
        return sum(
            [self.width_map.get(char, self.space_width * 2) for char in word], 0.0
        )

    @staticmethod
    def to_dict(font_instance: "Font") -> Dict[str, Any]:
        """Dataclass to dict for json.dumps serialization."""
        return {
            k: getattr(font_instance, k) for k in font_instance.__dataclass_fields__
        }
|
||||
@ -0,0 +1,208 @@
|
||||
# Widths for the standard 14 fonts as described on page 416 of the PDF 1.7 standard
|
||||
STANDARD_WIDTHS = {
|
||||
"Helvetica": { # 4 fonts, includes bold, oblique and boldoblique variants
|
||||
" ": 278,
|
||||
"!": 278,
|
||||
'"': 355,
|
||||
"#": 556,
|
||||
"$": 556,
|
||||
"%": 889,
|
||||
"&": 667,
|
||||
"'": 191,
|
||||
"(": 333,
|
||||
")": 333,
|
||||
"*": 389,
|
||||
"+": 584,
|
||||
",": 278,
|
||||
"-": 333,
|
||||
".": 278,
|
||||
"/": 278,
|
||||
"0": 556,
|
||||
"1": 556,
|
||||
"2": 556,
|
||||
"3": 556,
|
||||
"4": 556,
|
||||
"5": 556,
|
||||
"6": 556,
|
||||
"7": 556,
|
||||
"8": 556,
|
||||
"9": 556,
|
||||
":": 278,
|
||||
";": 278,
|
||||
"<": 584,
|
||||
"=": 584,
|
||||
">": 584,
|
||||
"?": 611,
|
||||
"@": 975,
|
||||
"A": 667,
|
||||
"B": 667,
|
||||
"C": 722,
|
||||
"D": 722,
|
||||
"E": 667,
|
||||
"F": 611,
|
||||
"G": 778,
|
||||
"H": 722,
|
||||
"I": 278,
|
||||
"J": 500,
|
||||
"K": 667,
|
||||
"L": 556,
|
||||
"M": 833,
|
||||
"N": 722,
|
||||
"O": 778,
|
||||
"P": 667,
|
||||
"Q": 944,
|
||||
"R": 667,
|
||||
"S": 667,
|
||||
"T": 611,
|
||||
"U": 278,
|
||||
"V": 278,
|
||||
"W": 584,
|
||||
"X": 556,
|
||||
"Y": 556,
|
||||
"Z": 500,
|
||||
"[": 556,
|
||||
"\\": 556,
|
||||
"]": 556,
|
||||
"^": 278,
|
||||
"_": 278,
|
||||
"`": 278,
|
||||
"a": 278,
|
||||
"b": 278,
|
||||
"c": 333,
|
||||
"d": 556,
|
||||
"e": 556,
|
||||
"f": 556,
|
||||
"g": 556,
|
||||
"h": 556,
|
||||
"i": 556,
|
||||
"j": 556,
|
||||
"k": 556,
|
||||
"l": 556,
|
||||
"m": 556,
|
||||
"n": 278,
|
||||
"o": 278,
|
||||
"p": 556,
|
||||
"q": 556,
|
||||
"r": 500,
|
||||
"s": 556,
|
||||
"t": 556,
|
||||
"u": 278,
|
||||
"v": 500,
|
||||
"w": 500,
|
||||
"x": 222,
|
||||
"y": 222,
|
||||
"z": 556,
|
||||
"{": 222,
|
||||
"|": 833,
|
||||
"}": 556,
|
||||
"~": 556,
|
||||
},
|
||||
"Times": { # 4 fonts, includes bold, oblique and boldoblique variants
|
||||
" ": 250,
|
||||
"!": 333,
|
||||
'"': 408,
|
||||
"#": 500,
|
||||
"$": 500,
|
||||
"%": 833,
|
||||
"&": 778,
|
||||
"'": 180,
|
||||
"(": 333,
|
||||
")": 333,
|
||||
"*": 500,
|
||||
"+": 564,
|
||||
",": 250,
|
||||
"-": 333,
|
||||
".": 250,
|
||||
"/": 564,
|
||||
"0": 500,
|
||||
"1": 500,
|
||||
"2": 500,
|
||||
"3": 500,
|
||||
"4": 500,
|
||||
"5": 500,
|
||||
"6": 500,
|
||||
"7": 500,
|
||||
"8": 500,
|
||||
"9": 500,
|
||||
":": 278,
|
||||
";": 278,
|
||||
"<": 564,
|
||||
"=": 564,
|
||||
">": 564,
|
||||
"?": 444,
|
||||
"@": 921,
|
||||
"A": 722,
|
||||
"B": 667,
|
||||
"C": 667,
|
||||
"D": 722,
|
||||
"E": 611,
|
||||
"F": 556,
|
||||
"G": 722,
|
||||
"H": 722,
|
||||
"I": 333,
|
||||
"J": 389,
|
||||
"K": 722,
|
||||
"L": 611,
|
||||
"M": 889,
|
||||
"N": 722,
|
||||
"O": 722,
|
||||
"P": 556,
|
||||
"Q": 722,
|
||||
"R": 667,
|
||||
"S": 556,
|
||||
"T": 611,
|
||||
"U": 722,
|
||||
"V": 722,
|
||||
"W": 944,
|
||||
"X": 722,
|
||||
"Y": 722,
|
||||
"Z": 611,
|
||||
"[": 333,
|
||||
"\\": 278,
|
||||
"]": 333,
|
||||
"^": 469,
|
||||
"_": 500,
|
||||
"`": 333,
|
||||
"a": 444,
|
||||
"b": 500,
|
||||
"c": 444,
|
||||
"d": 500,
|
||||
"e": 444,
|
||||
"f": 333,
|
||||
"g": 500,
|
||||
"h": 500,
|
||||
"i": 278,
|
||||
"j": 278,
|
||||
"k": 500,
|
||||
"l": 278,
|
||||
"m": 722,
|
||||
"n": 500,
|
||||
"o": 500,
|
||||
"p": 500,
|
||||
"q": 500,
|
||||
"r": 333,
|
||||
"s": 389,
|
||||
"t": 278,
|
||||
"u": 500,
|
||||
"v": 444,
|
||||
"w": 722,
|
||||
"x": 500,
|
||||
"y": 444,
|
||||
"z": 389,
|
||||
"{": 348,
|
||||
"|": 220,
|
||||
"}": 348,
|
||||
"~": 469,
|
||||
},
|
||||
}
|
||||
|
||||
# 4 fonts, includes bold, oblique and bold oblique variants
|
||||
STANDARD_WIDTHS[
|
||||
"Courier"
|
||||
] = dict.fromkeys(STANDARD_WIDTHS["Times"], 600) # fixed width
|
||||
STANDARD_WIDTHS["ZapfDingbats"] = dict.fromkeys(STANDARD_WIDTHS["Times"], 1000) # 1 font
|
||||
STANDARD_WIDTHS["Symbol"] = dict.fromkeys(STANDARD_WIDTHS["Times"], 500) # 1 font
|
||||
# add aliases per table H.3 on page 1110 of the PDF 1.7 standard
|
||||
STANDARD_WIDTHS["CourierNew"] = STANDARD_WIDTHS["Courier"]
|
||||
STANDARD_WIDTHS["Arial"] = STANDARD_WIDTHS["Helvetica"]
|
||||
STANDARD_WIDTHS["TimesNewRoman"] = STANDARD_WIDTHS["Times"]
|
||||
@ -0,0 +1,217 @@
|
||||
"""manage the PDF transform stack during "layout" mode text extraction"""
|
||||
|
||||
from collections import ChainMap, Counter
|
||||
from typing import Any, Dict, List, MutableMapping, Union
|
||||
from typing import ChainMap as ChainMapType
|
||||
from typing import Counter as CounterType
|
||||
|
||||
from ...errors import PdfReadError
|
||||
from .. import mult
|
||||
from ._font import Font
|
||||
from ._text_state_params import TextStateParams
|
||||
|
||||
TextStateManagerChainMapType = ChainMapType[Union[int, str], Union[float, bool]]
|
||||
TextStateManagerDictType = MutableMapping[Union[int, str], Union[float, bool]]
|
||||
|
||||
|
||||
class TextStateManager:
    """
    Tracks the current text state including cm/tm/trm transformation matrices.

    Attributes:
        transform_stack (ChainMap): ChainMap of cm/tm transformation matrices
        q_queue (Counter[int]): Counter of q operators
        q_depth (List[int]): list of q operator nesting levels
        Tc (float): character spacing
        Tw (float): word spacing
        Tz (int): horizontal scaling
        TL (float): leading
        Ts (float): text rise
        font (Font): font object
        font_size (int | float): font size

    """

    def __init__(self) -> None:
        # Bottom of the stack is an identity matrix; cm/tm/trm transforms are
        # pushed on top of it as new ChainMap children.
        self.transform_stack: TextStateManagerChainMapType = ChainMap(
            self.new_transform()
        )
        self.q_queue: CounterType[int] = Counter()  # cm count per q nesting level
        self.q_depth = [0]  # stack of active q nesting levels
        self.Tc: float = 0.0  # character spacing
        self.Tw: float = 0.0  # word spacing
        self.Tz: float = 100.0  # horizontal scaling, percent
        self.TL: float = 0.0  # leading
        self.Ts: float = 0.0  # text rise
        self.font: Union[Font, None] = None  # set by set_font (Tf operator)
        self.font_size: Union[int, float] = 0

    def set_state_param(self, op: bytes, value: Union[float, List[Any]]) -> None:
        """
        Set a text state parameter. Supports Tc, Tz, Tw, TL, and Ts operators.

        Args:
            op: operator read from PDF stream as bytes. No action is taken
                for unsupported operators (see supported operators above).
            value (float | List[Any]): new parameter value. If a list,
                value[0] is used.

        """
        if op not in [b"Tc", b"Tz", b"Tw", b"TL", b"Ts"]:
            return
        # The operator name matches the attribute name, e.g. b"Tc" -> self.Tc.
        self.__setattr__(op.decode(), value[0] if isinstance(value, list) else value)

    def set_font(self, font: Font, size: float) -> None:
        """
        Set the current font and font_size.

        Args:
            font (Font): a layout mode Font
            size (float): font size

        """
        self.font = font
        self.font_size = size

    def text_state_params(self, value: Union[bytes, str] = "") -> TextStateParams:
        """
        Create a TextStateParams instance to display a text string. Type[bytes] values
        will be decoded implicitly.

        Args:
            value (str | bytes): text to associate with the captured state.

        Raises:
            PdfReadError: if font not set (no Tf operator in incoming pdf content stream)

        Returns:
            TextStateParams: current text state parameters

        """
        if not isinstance(self.font, Font):
            raise PdfReadError(
                "font not set: is PDF missing a Tf operator?"
            )  # pragma: no cover
        if isinstance(value, bytes):
            try:
                if isinstance(self.font.encoding, str):
                    txt = value.decode(self.font.encoding, "surrogatepass")
                else:
                    # encoding is a per-byte lookup table; bytes missing from
                    # the table fall back to their own single-byte decode.
                    txt = "".join(
                        self.font.encoding[x]
                        if x in self.font.encoding
                        else bytes((x,)).decode()
                        for x in value
                    )
            except (UnicodeEncodeError, UnicodeDecodeError):
                txt = value.decode("utf-8", "replace")
            # Map decoded characters through the font's char_map where present.
            txt = "".join(
                self.font.char_map.get(x, x) for x in txt
            )
        else:
            txt = value
        return TextStateParams(
            txt,
            self.font,
            self.font_size,
            self.Tc,
            self.Tw,
            self.Tz,
            self.TL,
            self.Ts,
            self.effective_transform,
        )

    @staticmethod
    def raw_transform(
        _a: float = 1.0,
        _b: float = 0.0,
        _c: float = 0.0,
        _d: float = 1.0,
        _e: float = 0.0,
        _f: float = 0.0,
    ) -> Dict[int, float]:
        """Only a/b/c/d/e/f matrix params"""
        return dict(zip(range(6), map(float, (_a, _b, _c, _d, _e, _f))))

    @staticmethod
    def new_transform(
        _a: float = 1.0,
        _b: float = 0.0,
        _c: float = 0.0,
        _d: float = 1.0,
        _e: float = 0.0,
        _f: float = 0.0,
        is_text: bool = False,
        is_render: bool = False,
    ) -> TextStateManagerDictType:
        """Standard a/b/c/d/e/f matrix params + 'is_text' and 'is_render' keys"""
        result: Any = TextStateManager.raw_transform(_a, _b, _c, _d, _e, _f)
        result.update({"is_text": is_text, "is_render": is_render})
        return result

    def reset_tm(self) -> TextStateManagerChainMapType:
        """Clear all transforms from chainmap having is_text==True or is_render==True"""
        while (
            self.transform_stack.maps[0]["is_text"]
            or self.transform_stack.maps[0]["is_render"]
        ):
            # ChainMap.parents drops the innermost (most recently pushed) map.
            self.transform_stack = self.transform_stack.parents
        return self.transform_stack

    def reset_trm(self) -> TextStateManagerChainMapType:
        """Clear all transforms from chainmap having is_render==True"""
        while self.transform_stack.maps[0]["is_render"]:
            self.transform_stack = self.transform_stack.parents
        return self.transform_stack

    def remove_q(self) -> TextStateManagerChainMapType:
        """Rewind to stack prior state after closing a 'q' with internal 'cm' ops"""
        self.transform_stack = self.reset_tm()
        # Drop exactly as many cm maps as were pushed at the q level being
        # closed (q_queue counts them per level; 0 if none were pushed).
        self.transform_stack.maps = self.transform_stack.maps[
            self.q_queue.pop(self.q_depth.pop(), 0) :
        ]
        return self.transform_stack

    def add_q(self) -> None:
        """Add another level to q_queue"""
        self.q_depth.append(len(self.q_depth))

    def add_cm(self, *args: Any) -> TextStateManagerChainMapType:
        """Concatenate an additional transform matrix"""
        self.transform_stack = self.reset_tm()
        # Count this cm against the current q nesting level so remove_q()
        # can later pop the right number of maps.
        self.q_queue.update(self.q_depth[-1:])
        self.transform_stack = self.transform_stack.new_child(self.new_transform(*args))
        return self.transform_stack

    def _complete_matrix(self, operands: List[float]) -> List[float]:
        """Adds a, b, c, and d to an "e/f only" operand set (e.g. Td)"""
        if len(operands) == 2:  # this is a Td operator or equivalent
            operands = [1.0, 0.0, 0.0, 1.0, *operands]
        return operands

    def add_tm(self, operands: List[float]) -> TextStateManagerChainMapType:
        """Append a text transform matrix"""
        self.transform_stack = self.transform_stack.new_child(
            self.new_transform(  # type: ignore[misc]
                *self._complete_matrix(operands), is_text=True  # type: ignore[arg-type]
            )
        )
        return self.transform_stack

    def add_trm(self, operands: List[float]) -> TextStateManagerChainMapType:
        """Append a text rendering transform matrix"""
        self.transform_stack = self.transform_stack.new_child(
            self.new_transform(  # type: ignore[misc]
                *self._complete_matrix(operands), is_text=True, is_render=True  # type: ignore[arg-type]
            )
        )
        return self.transform_stack

    @property
    def effective_transform(self) -> List[float]:
        """Current effective transform accounting for cm, tm, and trm transforms"""
        # NOTE(review): each map also carries the is_text/is_render flags after
        # the six matrix entries; mult() appears to consume only indices 0-5 --
        # verify against the .. mult implementation.
        eff_transform = [*self.transform_stack.maps[0].values()]
        for transform in self.transform_stack.maps[1:]:
            eff_transform = mult(eff_transform, transform)  # type: ignore[arg-type]  # dict has int keys 0-5
        return eff_transform
@ -0,0 +1,129 @@
|
||||
"""A dataclass that captures the CTM and Text State for a tj operation"""
|
||||
|
||||
import math
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, Dict, List, Union
|
||||
|
||||
from .. import mult, orient
|
||||
from ._font import Font
|
||||
|
||||
|
||||
@dataclass
class TextStateParams:
    """
    Text state parameters and operator values for a single text value in a
    TJ or Tj PDF operation.

    Attributes:
        txt (str): the text to be rendered.
        font (Font): font object
        font_size (int | float): font size
        Tc (float): character spacing. Defaults to 0.0.
        Tw (float): word spacing. Defaults to 0.0.
        Tz (float): horizontal scaling. Defaults to 100.0.
        TL (float): leading, vertical displacement between text lines. Defaults to 0.0.
        Ts (float): text rise. Used for super/subscripts. Defaults to 0.0.
        transform (List[float]): effective transformation matrix.
        tx (float): x coord of rendered text, i.e. self.transform[4]
        ty (float): y coord of rendered text. May differ from self.transform[5] per self.Ts.
        displaced_tx (float): x coord immediately following rendered text
        space_tx (float): tx for a space character
        font_height (float): effective font height accounting for CTM
        flip_vertical (bool): True if y axis has been inverted (i.e. if self.transform[3] < 0.)
        rotated (bool): True if the text orientation is rotated with respect to the page.

    """

    txt: str
    font: Font
    font_size: Union[int, float]
    Tc: float = 0.0
    Tw: float = 0.0
    Tz: float = 100.0
    TL: float = 0.0
    Ts: float = 0.0
    transform: List[float] = field(
        default_factory=lambda: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
    )
    # The remaining fields are derived in __post_init__, not constructor args.
    tx: float = field(default=0.0, init=False)
    ty: float = field(default=0.0, init=False)
    displaced_tx: float = field(default=0.0, init=False)
    space_tx: float = field(default=0.0, init=False)
    font_height: float = field(default=0.0, init=False)
    flip_vertical: bool = field(default=False, init=False)
    rotated: bool = field(default=False, init=False)

    def __post_init__(self) -> None:
        # Undo a 90/270 degree rotation so downstream layout can work in
        # unrotated page coordinates; remember that it happened.
        if orient(self.transform) in (90, 270):
            self.transform = mult(
                [1.0, -self.transform[1], -self.transform[2], 1.0, 0.0, 0.0],
                self.transform,
            )
            self.rotated = True
        # self.transform[0] AND self.transform[3] < 0 indicates true rotation.
        # If only self.transform[3] < 0, the y coords are simply inverted.
        if orient(self.transform) == 180 and self.transform[0] < -1e-6:
            self.transform = mult([-1.0, 0.0, 0.0, -1.0, 0.0, 0.0], self.transform)
            self.rotated = True
        self.displaced_tx = self.displaced_transform()[4]
        self.tx = self.transform[4]
        self.ty = self.render_transform()[5]
        self.space_tx = round(self.word_tx(" "), 3)
        if self.space_tx < 1e-6:
            # if the " " char is assigned 0 width (e.g. for fine tuned spacing
            # with TJ int operators a la crazyones.pdf), calculate space_tx as
            # a TD_offset of -2 * font.space_width where font.space_width is
            # the space_width calculated in _cmap.py.
            self.space_tx = round(self.word_tx("", self.font.space_width * -2), 3)
        # Magnitude of the transformed y basis vector scales the font height.
        self.font_height = self.font_size * math.sqrt(
            self.transform[1] ** 2 + self.transform[3] ** 2
        )
        # flip_vertical handles PDFs generated by Microsoft Word's "publish" command.
        self.flip_vertical = self.transform[3] < -1e-6  # inverts y axis

    def font_size_matrix(self) -> List[float]:
        """Font size matrix"""
        return [
            self.font_size * (self.Tz / 100.0),
            0.0,
            0.0,
            self.font_size,
            0.0,
            self.Ts,
        ]

    def displaced_transform(self) -> List[float]:
        """Effective transform matrix after text has been rendered."""
        return mult(self.displacement_matrix(), self.transform)

    def render_transform(self) -> List[float]:
        """Effective transform matrix accounting for font size, Tz, and Ts."""
        return mult(self.font_size_matrix(), self.transform)

    def displacement_matrix(
        self, word: Union[str, None] = None, TD_offset: float = 0.0
    ) -> List[float]:
        """
        Text displacement matrix

        Args:
            word (str, optional): Defaults to None in which case self.txt displacement is
                returned.
            TD_offset (float, optional): translation applied by TD operator. Defaults to 0.0.

        """
        word = word if word is not None else self.txt
        return [1.0, 0.0, 0.0, 1.0, self.word_tx(word, TD_offset), 0.0]

    def word_tx(self, word: str, TD_offset: float = 0.0) -> float:
        """Horizontal text displacement for any word according to this text state"""
        return (
            (self.font_size * ((self.font.word_width(word) - TD_offset) / 1000.0))
            + self.Tc
            + word.count(" ") * self.Tw
        ) * (self.Tz / 100.0)

    @staticmethod
    def to_dict(inst: "TextStateParams") -> Dict[str, Any]:
        """Dataclass to dict for json.dumps serialization"""
        # The font object is not JSON serializable, so it is excluded.
        return {k: getattr(inst, k) for k in inst.__dataclass_fields__ if k != "font"}
605
venv/lib/python3.12/site-packages/pypdf/_utils.py
Normal file
605
venv/lib/python3.12/site-packages/pypdf/_utils.py
Normal file
@ -0,0 +1,605 @@
|
||||
# Copyright (c) 2006, Mathieu Fenniak
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
# met:
|
||||
#
|
||||
# * Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
# * The name of the author may not be used to endorse or promote products
|
||||
# derived from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
"""Utility functions for PDF library."""
|
||||
__author__ = "Mathieu Fenniak"
|
||||
__author_email__ = "biziqe@mathieu.fenniak.net"
|
||||
|
||||
import functools
|
||||
import logging
|
||||
import re
|
||||
import sys
|
||||
import warnings
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime, timezone
|
||||
from io import DEFAULT_BUFFER_SIZE
|
||||
from os import SEEK_CUR
|
||||
from typing import (
|
||||
IO,
|
||||
Any,
|
||||
Dict,
|
||||
List,
|
||||
Optional,
|
||||
Pattern,
|
||||
Tuple,
|
||||
Union,
|
||||
overload,
|
||||
)
|
||||
|
||||
if sys.version_info[:2] >= (3, 10):
|
||||
# Python 3.10+: https://www.python.org/dev/peps/pep-0484/
|
||||
from typing import TypeAlias
|
||||
else:
|
||||
from typing_extensions import TypeAlias
|
||||
|
||||
if sys.version_info >= (3, 11):
|
||||
from typing import Self
|
||||
else:
|
||||
from typing_extensions import Self
|
||||
|
||||
from .errors import (
|
||||
STREAM_TRUNCATED_PREMATURELY,
|
||||
DeprecationError,
|
||||
PdfStreamError,
|
||||
)
|
||||
|
||||
TransformationMatrixType: TypeAlias = Tuple[
|
||||
Tuple[float, float, float], Tuple[float, float, float], Tuple[float, float, float]
|
||||
]
|
||||
CompressedTransformationMatrix: TypeAlias = Tuple[
|
||||
float, float, float, float, float, float
|
||||
]
|
||||
|
||||
StreamType = IO[Any]
|
||||
StrByteType = Union[str, StreamType]
|
||||
|
||||
|
||||
def parse_iso8824_date(text: Optional[str]) -> Optional[datetime]:
    """
    Parse a PDF date string ("D:YYYYMMDDHHmmSSOHH'mm'", per ISO/IEC 8824).

    Partial dates (year only, year+month, ...) are accepted; a trailing
    "Z"/"z" or explicit "+0000" offset yields a timezone-aware UTC result,
    other results are naive datetimes.

    Args:
        text: raw date string from the PDF, with or without the "D:" prefix,
            or None.

    Returns:
        The parsed datetime, or None if *text* is None.

    Raises:
        ValueError: if *text* cannot be interpreted as a date.
    """
    orgtext = text
    if text is None:
        return None
    if not text:
        # Fail with the same error type callers already handle instead of
        # letting text[0] below raise an IndexError.
        raise ValueError(f"Can not convert date: {orgtext}")
    if text[0].isdigit():
        text = "D:" + text
    if text.endswith(("Z", "z")):
        # Bare "Z" means UTC; expand to an explicit zero offset for %z.
        text += "0000"
    text = text.replace("z", "+").replace("Z", "+").replace("'", "")
    i = max(text.find("+"), text.find("-"))
    if i > 0 and i != len(text) - 5:
        # Offset present but minutes are missing (e.g. "+05"); pad them.
        text += "00"
    for f in (
        "D:%Y",
        "D:%Y%m",
        "D:%Y%m%d",
        "D:%Y%m%d%H",
        "D:%Y%m%d%H%M",
        "D:%Y%m%d%H%M%S",
        "D:%Y%m%d%H%M%S%z",
    ):
        try:
            d = datetime.strptime(text, f)  # noqa: DTZ007
        except ValueError:
            continue
        else:
            if text.endswith("+0000"):
                # Normalize an explicit zero offset to real UTC tzinfo.
                d = d.replace(tzinfo=timezone.utc)
            return d
    raise ValueError(f"Can not convert date: {orgtext}")
||||
def _get_max_pdf_version_header(header1: str, header2: str) -> str:
|
||||
versions = (
|
||||
"%PDF-1.3",
|
||||
"%PDF-1.4",
|
||||
"%PDF-1.5",
|
||||
"%PDF-1.6",
|
||||
"%PDF-1.7",
|
||||
"%PDF-2.0",
|
||||
)
|
||||
pdf_header_indices = []
|
||||
if header1 in versions:
|
||||
pdf_header_indices.append(versions.index(header1))
|
||||
if header2 in versions:
|
||||
pdf_header_indices.append(versions.index(header2))
|
||||
if len(pdf_header_indices) == 0:
|
||||
raise ValueError(f"Neither {header1!r} nor {header2!r} are proper headers")
|
||||
return versions[max(pdf_header_indices)]
|
||||
|
||||
|
||||
# The PDF whitespace characters: NUL, TAB, LF, FF, CR, and space.
WHITESPACES = (b"\x00", b"\t", b"\n", b"\f", b"\r", b" ")
# Same characters concatenated into one bytes object for fast membership tests.
WHITESPACES_AS_BYTES = b"".join(WHITESPACES)
# Character class matching one whitespace byte, for byte-string regexes.
WHITESPACES_AS_REGEXP = b"[" + WHITESPACES_AS_BYTES + b"]"
|
||||
def read_until_whitespace(stream: StreamType, maxchars: Optional[int] = None) -> bytes:
    """
    Read non-whitespace characters and return them.

    Stops upon encountering whitespace, at EOF, or once the collected data
    reaches maxchars bytes.

    Args:
        stream: The data stream from which was read.
        maxchars: The maximum number of bytes returned; by default unlimited.

    Returns:
        The data which was read.

    """
    collected = bytearray()
    while True:
        byte = stream.read(1)
        if not byte or byte.isspace():
            break
        collected += byte
        # Note: compared after appending, so maxchars=0 is never hit --
        # this mirrors the historical behavior of the function.
        if len(collected) == maxchars:
            break
    return bytes(collected)
||||
def read_non_whitespace(stream: StreamType) -> bytes:
    """
    Find and read the next non-whitespace character (ignores whitespace).

    Args:
        stream: The data stream from which was read.

    Returns:
        The data which was read (b"" at EOF).

    """
    while True:
        byte = stream.read(1)
        # EOF yields b"", which is not in WHITESPACES, so it is returned.
        if byte not in WHITESPACES:
            return byte
||||
def skip_over_whitespace(stream: StreamType) -> bool:
    """
    Similar to read_non_whitespace, but return a boolean if at least one
    whitespace character was read.

    Args:
        stream: The data stream from which was read.

    Returns:
        True if one or more whitespace was skipped, otherwise return False.

    """
    skipped_any = False
    byte = stream.read(1)
    while byte in WHITESPACES:
        skipped_any = True
        byte = stream.read(1)
    # The first non-whitespace byte has been consumed, matching the
    # historical behavior of this helper.
    return skipped_any
||||
def check_if_whitespace_only(value: bytes) -> bool:
    """
    Check if the given value consists of whitespace characters only.

    Args:
        value: The bytes to check.

    Returns:
        True if the value only has whitespace characters (or is empty),
        otherwise return False.

    """
    # Deleting every whitespace byte leaves b"" iff the input was
    # whitespace-only; one C-level pass instead of a Python loop.
    return not value.translate(None, WHITESPACES_AS_BYTES)
||||
def skip_over_comment(stream: StreamType) -> None:
    """
    Skip a PDF comment if one starts at the current stream position.

    A comment runs from '%' through the first CR or LF. If the next byte is
    not '%', the stream position is left unchanged.

    Raises:
        PdfStreamError: if EOF is hit before the comment's line ends.
    """
    marker = stream.read(1)
    stream.seek(-1, 1)
    if marker != b"%":
        return
    current = marker
    while current not in (b"\n", b"\r"):
        current = stream.read(1)
        if not current:
            raise PdfStreamError("File ended unexpectedly.")
||||
def read_until_regex(stream: StreamType, regex: Pattern[bytes]) -> bytes:
    """
    Read until the regular expression pattern matched (ignore the match).
    Treats EOF on the underlying stream as the end of the token to be matched.

    Args:
        regex: re.Pattern

    Returns:
        The read bytes.

    """
    name = b""
    while True:
        tok = stream.read(16)
        if not tok:
            # EOF: everything read so far is the token.
            return name
        # Search the whole accumulated buffer, not just the new chunk, so a
        # match spanning a 16-byte chunk boundary is still found.
        m = regex.search(name + tok)
        if m is not None:
            # Rewind the stream so the next read starts at the match, then
            # keep only the bytes that precede it.
            stream.seek(m.start() - (len(name) + len(tok)), 1)
            name = (name + tok)[: m.start()]
            break
        name += tok
    return name
||||
def read_block_backwards(stream: StreamType, to_read: int) -> bytes:
    """
    Given a stream at position X, read a block of size to_read ending at position X.

    This changes the stream's position to the beginning of where the block was
    read.

    Args:
        stream: the (seekable) stream to read from.
        to_read: number of bytes to read.

    Returns:
        The data which was read.

    Raises:
        PdfStreamError: if fewer than to_read bytes precede the position.
    """
    block_start = stream.tell() - to_read
    if block_start < 0:
        raise PdfStreamError("Could not read malformed PDF file")
    stream.seek(block_start)
    block = stream.read(to_read)
    # Leave the stream at the start of the block just read.
    stream.seek(block_start)
    return block
||||
def read_previous_line(stream: StreamType) -> bytes:
    """
    Given a byte stream with current position X, return the previous line.

    All characters between the first CR/LF byte found before X
    (or, the start of the file, if no such byte is found) and position X
    are returned.

    After this call, the stream will be positioned one byte after the
    first non-CRLF character found beyond the first CR/LF byte before X,
    or, if no such byte is found, at the beginning of the stream.

    Args:
        stream: the (seekable) stream to read from.

    Returns:
        The data which was read.

    Raises:
        PdfStreamError: if called with the stream at position 0.

    """
    line_content = []
    found_crlf = False
    if stream.tell() == 0:
        raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
    while True:
        to_read = min(DEFAULT_BUFFER_SIZE, stream.tell())
        if to_read == 0:
            break
        # Read the block. After this, our stream will be one
        # beyond the initial position.
        block = read_block_backwards(stream, to_read)
        idx = len(block) - 1
        if not found_crlf:
            # We haven't found our first CR/LF yet.
            # Read off characters until we hit one.
            while idx >= 0 and block[idx] not in b"\r\n":
                idx -= 1
            if idx >= 0:
                found_crlf = True
        if found_crlf:
            # We found our first CR/LF already (on this block or
            # a previous one).
            # Our combined line is the remainder of the block
            # plus any previously read blocks.
            line_content.append(block[idx + 1 :])
            # Continue to read off any more CRLF characters.
            while idx >= 0 and block[idx] in b"\r\n":
                idx -= 1
        else:
            # Didn't find CR/LF yet - add this block to our
            # previously read blocks and continue.
            line_content.append(block)
        if idx >= 0:
            # We found the next non-CRLF character.
            # Set the stream position correctly, then break
            stream.seek(idx + 1, SEEK_CUR)
            break
    # Join all the blocks in the line (which are in reverse order)
    return b"".join(line_content[::-1])
||||
def matrix_multiply(
    a: TransformationMatrixType, b: TransformationMatrixType
) -> TransformationMatrixType:
    """Multiply two 3x3 transformation matrices, coercing entries to float."""
    b_columns = list(zip(*b))
    product = []
    for row in a:
        product.append(
            tuple(
                sum(float(x) * float(y) for x, y in zip(row, column))
                for column in b_columns
            )
        )
    return tuple(product)  # type: ignore[return-value]
||||
def mark_location(stream: StreamType) -> None:
    """Create text file showing current location in context."""
    # Mainly for debugging: dump 5000 bytes on each side of the current
    # position with a literal HERE marker between them.
    context_radius = 5000
    stream.seek(-context_radius, 1)
    before = stream.read(context_radius)
    after = stream.read(context_radius)
    with open("pypdf_pdfLocation.txt", "wb") as marker_file:
        marker_file.write(before)
        marker_file.write(b"HERE")
        marker_file.write(after)
    stream.seek(-context_radius, 1)
||||
@overload
def ord_(b: str) -> int:
    ...


@overload
def ord_(b: bytes) -> bytes:
    ...


@overload
def ord_(b: int) -> int:
    ...


def ord_(b: Union[int, str, bytes]) -> Union[int, bytes]:
    """Return the code point of a 1-char string; pass ints and bytes through."""
    return ord(b) if isinstance(b, str) else b
||||
def deprecate(msg: str, stacklevel: int = 3) -> None:
    """Emit msg as a DeprecationWarning, attributed stacklevel frames up."""
    warnings.warn(message=msg, category=DeprecationWarning, stacklevel=stacklevel)
||||
def deprecation(msg: str) -> None:
    """Raise a DeprecationError for a feature that has already been removed."""
    raise DeprecationError(msg)
||||
def deprecate_with_replacement(old_name: str, new_name: str, removed_in: str) -> None:
    """Issue a warning that a feature will be removed, but has a replacement."""
    deprecate(
        f"{old_name} is deprecated and will be removed in pypdf {removed_in}. Use {new_name} instead.",
        4,  # stacklevel forwarded to warnings.warn via deprecate()
    )
||||
def deprecation_with_replacement(old_name: str, new_name: str, removed_in: str) -> None:
    """Raise an exception that a feature was already removed, but has a replacement."""
    deprecation(
        f"{old_name} is deprecated and was removed in pypdf {removed_in}. Use {new_name} instead."
    )
||||
def deprecate_no_replacement(name: str, removed_in: str) -> None:
    """Issue a warning that a feature will be removed without replacement."""
    # 4 = stacklevel forwarded to warnings.warn via deprecate()
    deprecate(f"{name} is deprecated and will be removed in pypdf {removed_in}.", 4)
||||
def deprecation_no_replacement(name: str, removed_in: str) -> None:
    """Raise an exception that a feature was already removed without replacement."""
    deprecation(f"{name} is deprecated and was removed in pypdf {removed_in}.")
||||
def logger_error(msg: str, src: str) -> None:
    """
    Use this instead of logger.error directly.

    That allows people to overwrite it more easily.

    See the docs on when to use which:
    https://pypdf.readthedocs.io/en/latest/user/suppress-warnings.html
    """
    source_logger = logging.getLogger(src)
    source_logger.error(msg)
||||
def logger_warning(msg: str, src: str) -> None:
    """
    Use this instead of logger.warning directly.

    That allows people to overwrite it more easily.

    ## Exception, warnings.warn, logger_warning
    - Exceptions should be used if the user should write code that deals with
      an error case, e.g. the PDF being completely broken.
    - warnings.warn should be used if the user needs to fix their code, e.g.
      DeprecationWarnings
    - logger_warning should be used if the user needs to know that an issue was
      handled by pypdf, e.g. a non-compliant PDF being read in a way that
      pypdf could apply a robustness fix to still read it. This applies mainly
      to strict=False mode.
    """
    source_logger = logging.getLogger(src)
    source_logger.warning(msg)
||||
def rename_kwargs(
    func_name: str, kwargs: Dict[str, Any], aliases: Dict[str, str], fail: bool = False
) -> None:
    """
    Helper function to deprecate arguments.

    For every alias found in kwargs, either raise (fail=True) or move the
    value to its new name in place and emit a DeprecationWarning.

    Args:
        func_name: Name of the function whose arguments are being renamed.
        kwargs: keyword arguments as received by the caller; mutated in place.
        aliases: mapping of deprecated name -> replacement name.
        fail: when True, raise instead of warning.

    Raises:
        DeprecationError: if fail is True and a deprecated name is present.
        TypeError: if both the old and the new name were supplied.
    """
    for old_term, new_term in aliases.items():
        if old_term not in kwargs:
            continue
        if fail:
            raise DeprecationError(
                f"{old_term} is deprecated as an argument. Use {new_term} instead"
            )
        if new_term in kwargs:
            raise TypeError(
                f"{func_name} received both {old_term} and {new_term} as "
                f"an argument. {old_term} is deprecated. "
                f"Use {new_term} instead."
            )
        kwargs[new_term] = kwargs.pop(old_term)
        warnings.warn(
            message=(
                f"{old_term} is deprecated as an argument. Use {new_term} instead"
            ),
            category=DeprecationWarning,
        )
||||
def _human_readable_bytes(bytes: int) -> str:
|
||||
if bytes < 10**3:
|
||||
return f"{bytes} Byte"
|
||||
elif bytes < 10**6:
|
||||
return f"{bytes / 10**3:.1f} kB"
|
||||
elif bytes < 10**9:
|
||||
return f"{bytes / 10**6:.1f} MB"
|
||||
else:
|
||||
return f"{bytes / 10**9:.1f} GB"
|
||||
|
||||
|
||||
# The following class has been copied from Django:
|
||||
# https://github.com/django/django/blob/adae619426b6f50046b3daaa744db52989c9d6db/django/utils/functional.py#L51-L65
|
||||
# It received some modifications to comply with our own coding standards.
|
||||
#
|
||||
# Original license:
|
||||
#
|
||||
# ---------------------------------------------------------------------------------
|
||||
# Copyright (c) Django Software Foundation and individual contributors.
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without modification,
|
||||
# are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of Django nor the names of its contributors may be used
|
||||
# to endorse or promote products derived from this software without
|
||||
# specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
|
||||
# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
# ---------------------------------------------------------------------------------
|
||||
class classproperty:  # noqa: N801
    """
    Decorator that converts a method with a single cls argument into a property
    that can be accessed directly from the class.
    """

    def __init__(self, method=None) -> None:  # type: ignore  # noqa: ANN001
        # The wrapped getter; may also be installed later via .getter().
        self.fget = method

    def __get__(self, instance, cls=None) -> Any:  # type: ignore  # noqa: ANN001
        # The instance is ignored: the getter is always called with the class,
        # so the property yields the same value from the class or an instance.
        return self.fget(cls)

    def getter(self, method) -> Self:  # type: ignore  # noqa: ANN001
        # Mirrors property.getter(): replace the getter and return self.
        self.fget = method
        return self
||||
@dataclass
class File:
    # Imported inside the class body rather than at module level -- presumably
    # to avoid an import cycle with .generic; verify before moving it.
    from .generic import IndirectObject

    name: str = ""
    """
    Filename as identified within the PDF file.
    """
    data: bytes = b""
    """
    Data as bytes.
    """
    indirect_reference: Optional[IndirectObject] = None
    """
    Reference to the object storing the stream.
    """

    def __str__(self) -> str:
        return f"{self.__class__.__name__}(name={self.name}, data: {_human_readable_bytes(len(self.data))})"

    def __repr__(self) -> str:
        # Reuse __str__ and splice the data hash in before the closing paren.
        return self.__str__()[:-1] + f", hash: {hash(self.data)})"
||||
@functools.total_ordering
class Version:
    """
    Loosely-ordered version string, e.g. ``"2.10.1"`` or ``"1.0b"``.

    Each dot-separated component is parsed into an ``(integer, suffix)``
    pair. Ordering compares components pairwise — integer first, then the
    textual suffix — and falls back to the number of components.
    ``functools.total_ordering`` derives the remaining comparison operators
    from ``__eq__`` and ``__lt__``.

    Fix: the original defined ``__eq__`` without ``__hash__``, which makes
    Python set ``__hash__ = None`` — instances could not be used in sets or
    as dict keys. A components-based ``__hash__`` (consistent with
    ``__eq__``) is now provided.
    """

    # One component = optional integer prefix followed by an arbitrary suffix.
    COMPONENT_PATTERN = re.compile(r"^(\d+)(.*)$")

    def __init__(self, version_str: str) -> None:
        self.version_str = version_str
        self.components = self._parse_version(version_str)

    def _parse_version(self, version_str: str) -> List[Tuple[int, str]]:
        """Split on dots, yielding an ``(int prefix, suffix)`` pair per component."""
        components = version_str.split(".")
        parsed_components = []
        for component in components:
            match = Version.COMPONENT_PATTERN.match(component)
            if not match:
                # Purely textual component (e.g. "beta"): sorts as number 0.
                parsed_components.append((0, component))
                continue
            integer_prefix = match.group(1)
            suffix = match.group(2)
            if integer_prefix is None:
                integer_prefix = 0
            parsed_components.append((int(integer_prefix), suffix))
        return parsed_components

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, Version):
            return False
        return self.components == other.components

    def __hash__(self) -> int:
        # Must be consistent with __eq__: equal component lists hash equally.
        return hash(tuple(self.components))

    def __lt__(self, other: Any) -> bool:
        if not isinstance(other, Version):
            raise ValueError(f"Version cannot be compared against {type(other)}")

        for self_component, other_component in zip(self.components, other.components):
            self_value, self_suffix = self_component
            other_value, other_suffix = other_component

            # Numeric part decides first ...
            if self_value < other_value:
                return True
            elif self_value > other_value:
                return False

            # ... then the textual suffix (plain string ordering).
            if self_suffix < other_suffix:
                return True
            elif self_suffix > other_suffix:
                return False

        # All shared components equal: the shorter version is the smaller one.
        return len(self.components) < len(other.components)
|
||||
1
venv/lib/python3.12/site-packages/pypdf/_version.py
Normal file
1
venv/lib/python3.12/site-packages/pypdf/_version.py
Normal file
@ -0,0 +1 @@
|
||||
__version__ = "5.4.0"
|
||||
3380
venv/lib/python3.12/site-packages/pypdf/_writer.py
Normal file
3380
venv/lib/python3.12/site-packages/pypdf/_writer.py
Normal file
File diff suppressed because it is too large
Load Diff
379
venv/lib/python3.12/site-packages/pypdf/_xobj_image_helpers.py
Normal file
379
venv/lib/python3.12/site-packages/pypdf/_xobj_image_helpers.py
Normal file
@ -0,0 +1,379 @@
|
||||
"""Code in here is only used by pypdf.filters._xobj_to_image"""
|
||||
|
||||
import sys
|
||||
from io import BytesIO
|
||||
from typing import Any, Dict, List, Literal, Tuple, Union, cast
|
||||
|
||||
from ._utils import check_if_whitespace_only, logger_warning
|
||||
from .constants import ColorSpaces
|
||||
from .constants import FilterTypes as FT
|
||||
from .constants import ImageAttributes as IA
|
||||
from .errors import EmptyImageDataError, PdfReadError
|
||||
from .generic import (
|
||||
ArrayObject,
|
||||
DecodedStreamObject,
|
||||
EncodedStreamObject,
|
||||
IndirectObject,
|
||||
NullObject,
|
||||
TextStringObject,
|
||||
)
|
||||
|
||||
if sys.version_info[:2] >= (3, 10):
|
||||
from typing import TypeAlias
|
||||
else:
|
||||
from typing_extensions import TypeAlias
|
||||
|
||||
|
||||
try:
|
||||
from PIL import Image, UnidentifiedImageError # noqa: F401
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"pillow is required to do image extraction. "
|
||||
"It can be installed via 'pip install pypdf[image]'"
|
||||
)
|
||||
|
||||
mode_str_type: TypeAlias = Literal[
|
||||
"", "1", "RGB", "2bits", "4bits", "P", "L", "RGBA", "CMYK"
|
||||
]
|
||||
|
||||
MAX_IMAGE_MODE_NESTING_DEPTH: int = 10
|
||||
|
||||
|
||||
def _get_imagemode(
    color_space: Union[str, List[Any], Any],
    color_components: int,
    prev_mode: mode_str_type,
    depth: int = 0,
) -> Tuple[mode_str_type, bool]:
    """
    Resolve a PDF color space into a PIL image mode.

    Recurses through composite color spaces (/ICCBased, /Indexed,
    /Separation, /DeviceN) until a device color space or a mode name
    is reached; ``depth`` guards against unbounded nesting.

    Returns:
        Image mode, not taking into account mask (transparency).
        ColorInversion is required (like for some DeviceCMYK).

    """
    if depth > MAX_IMAGE_MODE_NESTING_DEPTH:
        raise PdfReadError(
            "Color spaces nested too deeply. If required, consider increasing MAX_IMAGE_MODE_NESTING_DEPTH."
        )
    if isinstance(color_space, NullObject):
        # No usable color space information at all.
        return "", False
    if isinstance(color_space, str):
        # Already a device color space name or a mode key; fall through to
        # the mode_map lookup below.
        pass
    elif not isinstance(color_space, list):
        raise PdfReadError(
            "Cannot interpret color space", color_space
        )  # pragma: no cover
    elif color_space[0].startswith("/Cal"):  # /CalRGB and /CalGray
        # Calibrated spaces are approximated by their device counterparts.
        color_space = "/Device" + color_space[0][4:]
    elif color_space[0] == "/ICCBased":
        # /N of the ICC stream gives the component count; /Alternate (if
        # present) names the fallback device space.
        icc_profile = color_space[1].get_object()
        color_components = cast(int, icc_profile["/N"])
        color_space = icc_profile.get("/Alternate", "")
    elif color_space[0] == "/Indexed":
        # Resolve the base space, then force palette mode for RGB/CMYK bases.
        color_space = color_space[1].get_object()
        mode, invert_color = _get_imagemode(
            color_space, color_components, prev_mode, depth + 1
        )
        if mode in ("RGB", "CMYK"):
            mode = "P"
        return mode, invert_color
    elif color_space[0] == "/Separation":
        # Separation always requires color inversion (second return value).
        color_space = color_space[2]
        if isinstance(color_space, IndirectObject):
            color_space = color_space.get_object()
        mode, invert_color = _get_imagemode(
            color_space, color_components, prev_mode, depth + 1
        )
        return mode, True
    elif color_space[0] == "/DeviceN":
        original_color_space = color_space
        color_components = len(color_space[1])
        color_space = color_space[2]
        if isinstance(color_space, IndirectObject):  # pragma: no cover
            color_space = color_space.get_object()
        if color_space == "/DeviceCMYK" and color_components == 1:
            # Single-component DeviceN over CMYK is treated as grayscale;
            # warn when the single colorant is not /Black.
            if original_color_space[1][0] != "/Black":
                logger_warning(
                    f"Color {original_color_space[1][0]} converted to Gray. Please share PDF with pypdf dev team",
                    __name__,
                )
            return "L", True
        mode, invert_color = _get_imagemode(
            color_space, color_components, prev_mode, depth + 1
        )
        return mode, invert_color

    mode_map: Dict[str, mode_str_type] = {
        "1bit": "1",  # must be zeroth position: color_components may index the values
        "/DeviceGray": "L",  # must be first position: color_components may index the values
        "palette": "P",  # must be second position: color_components may index the values
        "/DeviceRGB": "RGB",  # must be third position: color_components may index the values
        "/DeviceCMYK": "CMYK",  # must be fourth position: color_components may index the values
        "2bit": "2bits",
        "4bit": "4bits",
    }

    # Lookup by name first; otherwise fall back to indexing the map values by
    # component count, then to the caller-supplied previous mode.
    # NOTE(review): a color_components value >= 7 would raise IndexError
    # here — presumably unreachable for well-formed PDFs; confirm.
    mode = (
        mode_map.get(color_space)
        or list(mode_map.values())[color_components]
        or prev_mode
    )

    return mode, mode == "CMYK"
|
||||
|
||||
|
||||
def bits2byte(data: bytes, size: Tuple[int, int], bits: int) -> bytes:
    """
    Expand sub-byte packed pixel data to one byte per pixel.

    ``data`` holds ``size[0] * size[1]`` pixels of ``bits`` bits each,
    packed MSB-first, with every row starting on a fresh byte boundary.
    Returns a buffer of one byte per pixel.
    """
    mask = (1 << bits) - 1
    width, height = size
    out = bytearray(width * height)
    data_index = 0
    top_shift = 8 - bits
    shift = top_shift
    for row in range(height):
        if shift != top_shift:
            # Previous row ended mid-byte: rows are byte-aligned, so skip the
            # remaining padding bits.
            data_index += 1
            shift = top_shift
        base = row * width
        for col in range(width):
            out[base + col] = (data[data_index] >> shift) & mask
            shift -= bits
            if shift < 0:
                data_index += 1
                shift = top_shift
    return bytes(out)
|
||||
|
||||
|
||||
def _extended_image_frombytes(
    mode: str, size: Tuple[int, int], data: bytes
) -> Image.Image:
    """
    ``Image.frombytes`` with a recovery path for undersized pixel data.

    When PIL rejects the buffer, and its length divides the pixel count
    evenly, each source byte is replicated to widen the per-pixel data and
    the image is rebuilt; empty data raises EmptyImageDataError, any other
    mismatch re-raises PIL's ValueError.
    """
    try:
        img = Image.frombytes(mode, size, data)
    except ValueError as exc:
        nb_pix = size[0] * size[1]
        data_length = len(data)
        if data_length == 0:
            raise EmptyImageDataError(
                "Data is 0 bytes, cannot process an image from empty data."
            ) from exc
        if data_length % nb_pix != 0:
            # Not an integer number of bytes per pixel: unrecoverable here.
            raise exc
        # Replication factor so the widened buffer matches mode's band count.
        # NOTE(review): assumes len(mode) equals the expected bytes per pixel
        # for the modes reaching this path — confirm for e.g. "1"/"P".
        k = nb_pix * len(mode) / data_length
        data = b"".join(bytes((x,) * int(k)) for x in data)
        img = Image.frombytes(mode, size, data)
    return img
|
||||
|
||||
|
||||
def _handle_flate(
    size: Tuple[int, int],
    data: bytes,
    mode: mode_str_type,
    color_space: str,
    colors: int,
    obj_as_text: str,
) -> Tuple[Image.Image, str, str, bool]:
    """
    Process image encoded in flateEncode
    Returns img, image_format, extension, color inversion
    """
    extension = ".png"  # mime_type = "image/png"
    image_format = "PNG"
    lookup: Any
    base: Any
    hival: Any
    if isinstance(color_space, ArrayObject) and color_space[0] == "/Indexed":
        # /Indexed array layout: [/Indexed, base space, hival, lookup table].
        color_space, base, hival, lookup = (value.get_object() for value in color_space)
    if mode == "2bits":
        # Sub-byte samples must be widened to one byte per pixel for PIL.
        mode = "P"
        data = bits2byte(data, size, 2)
    elif mode == "4bits":
        mode = "P"
        data = bits2byte(data, size, 4)
    img = _extended_image_frombytes(mode, size, data)
    if color_space == "/Indexed":
        # Normalize the lookup table to raw bytes, whatever object carried it.
        if isinstance(lookup, (EncodedStreamObject, DecodedStreamObject)):
            lookup = lookup.get_data()
        if isinstance(lookup, TextStringObject):
            lookup = lookup.original_bytes
        if isinstance(lookup, str):
            lookup = lookup.encode()
        try:
            # nb: bytes per palette entry; conv: intermediate conversion mode;
            # mode: final palette rawmode — keyed by the base space's mode.
            nb, conv, mode = {  # type: ignore
                "1": (0, "", ""),
                "L": (1, "P", "L"),
                "P": (0, "", ""),
                "RGB": (3, "P", "RGB"),
                "CMYK": (4, "P", "CMYK"),
            }[_get_imagemode(base, 0, "")[0]]
        except KeyError:  # pragma: no cover
            logger_warning(
                f"Base {base} not coded please share the pdf file with pypdf dev team",
                __name__,
            )
            lookup = None
        else:
            if img.mode == "1":
                # Two values ("high" and "low").
                expected_count = 2 * nb
                actual_count = len(lookup)
                if actual_count != expected_count:
                    if actual_count < expected_count:
                        logger_warning(
                            f"Not enough lookup values: Expected {expected_count}, got {actual_count}.",
                            __name__
                        )
                        # Zero-pad a short table to the expected length.
                        lookup += bytes([0] * (expected_count - actual_count))
                    elif not check_if_whitespace_only(lookup[expected_count:]):
                        logger_warning(
                            f"Too many lookup values: Expected {expected_count}, got {actual_count}.",
                            __name__
                        )
                    lookup = lookup[:expected_count]
                # Map each 1-bit pixel to its palette entry by hand.
                colors_arr = [lookup[:nb], lookup[nb:]]
                arr = b"".join(
                    b"".join(
                        colors_arr[1 if img.getpixel((x, y)) > 127 else 0]
                        for x in range(img.size[0])
                    )
                    for y in range(img.size[1])
                )
                img = Image.frombytes(mode, img.size, arr)
            else:
                img = img.convert(conv)
                if len(lookup) != (hival + 1) * nb:
                    logger_warning(f"Invalid Lookup Table in {obj_as_text}", __name__)
                    lookup = None
                elif mode == "L":
                    # gray lookup does not work : it is converted to a similar RGB lookup
                    lookup = b"".join([bytes([b, b, b]) for b in lookup])
                    mode = "RGB"
                # TODO : cf https://github.com/py-pdf/pypdf/pull/2039
                # this is a work around until PIL is able to process CMYK images
                elif mode == "CMYK":
                    # Manually convert each CMYK palette entry to RGB.
                    _rgb = []
                    for _c, _m, _y, _k in (
                        lookup[n : n + 4] for n in range(0, 4 * (len(lookup) // 4), 4)
                    ):
                        _r = int(255 * (1 - _c / 255) * (1 - _k / 255))
                        _g = int(255 * (1 - _m / 255) * (1 - _k / 255))
                        _b = int(255 * (1 - _y / 255) * (1 - _k / 255))
                        _rgb.append(bytes((_r, _g, _b)))
                    lookup = b"".join(_rgb)
                    mode = "RGB"
            if lookup is not None:
                img.putpalette(lookup, rawmode=mode)
                img = img.convert("L" if base == ColorSpaces.DEVICE_GRAY else "RGB")
    elif not isinstance(color_space, NullObject) and color_space[0] == "/ICCBased":
        # see Table 66 - Additional Entries Specific to an ICC Profile
        # Stream Dictionary
        mode2 = _get_imagemode(color_space, colors, mode)[0]
        if mode != mode2:
            img = Image.frombytes(mode2, size, data)  # reloaded as mode may have change
    if mode == "CMYK":
        extension = ".tif"
        image_format = "TIFF"
    return img, image_format, extension, False
|
||||
|
||||
|
||||
def _handle_jpx(
    size: Tuple[int, int],
    data: bytes,
    mode: mode_str_type,
    color_space: str,
    colors: int,
) -> Tuple[Image.Image, str, str, bool]:
    """
    Process an image encoded with the JPXDecode filter (JPEG 2000).

    (The original docstring said "flateEncode" — copy-paste from
    _handle_flate.)

    Returns img, image_format, extension, inversion
    """
    extension = ".jp2"  # mime_type = "image/x-jp2"
    img1 = Image.open(BytesIO(data), formats=("JPEG2000",))
    mode, invert_color = _get_imagemode(color_space, colors, mode)
    if mode == "":
        # No usable color space: trust the decoded image's own mode.
        mode = cast(mode_str_type, img1.mode)
        invert_color = mode in ("CMYK",)
    if img1.mode == "RGBA" and mode == "RGB":
        # Keep the alpha channel PIL decoded, even if the PDF said RGB.
        mode = "RGBA"
    # we need to convert to the good mode
    if img1.mode == mode or {img1.mode, mode} == {"L", "P"}:  # compare (unordered) sets
        # L and P are indexed modes which should not be changed.
        img = img1
    elif {img1.mode, mode} == {"RGBA", "CMYK"}:
        # RGBA / CMYK are 4bytes encoding where
        # the encoding should be corrected
        img = Image.frombytes(mode, img1.size, img1.tobytes())
    else:  # pragma: no cover
        img = img1.convert(mode)
    # for CMYK conversion :
    # https://stackoverflow.com/questions/38855022/conversion-from-cmyk-to-rgb-with-pillow-is-different-from-that-of-photoshop
    # not implemented for the moment as I need to get properly the ICC
    if img.mode == "CMYK":
        img = img.convert("RGB")
    image_format = "JPEG2000"
    return img, image_format, extension, invert_color
|
||||
|
||||
|
||||
def _apply_decode(
    img: Image.Image,
    x_object_obj: Dict[str, Any],
    lfilters: FT,
    color_space: Union[str, List[Any], Any],
    invert_color: bool,
) -> Image.Image:
    """
    Apply the image XObject's /Decode array (sample remapping) to ``img``.

    When /Decode is absent, a default inverting array is synthesized for
    the cases where inversion is required; an identity decode array is a
    no-op and leaves the image untouched.
    """
    # CMYK image and other color spaces without decode
    # requires reverting scale (cf p243,2§ last sentence)
    decode = x_object_obj.get(
        IA.DECODE,
        ([1.0, 0.0] * len(img.getbands()))
        if (
            (img.mode == "CMYK" and lfilters in (FT.DCT_DECODE, FT.JPX_DECODE))
            or (invert_color and img.mode == "L")
        )
        else None,
    )
    if (
        isinstance(color_space, ArrayObject)
        and color_space[0].get_object() == "/Indexed"
    ):
        decode = None  # decode is meaningless if Indexed
    if (
        isinstance(color_space, ArrayObject)
        and color_space[0].get_object() == "/Separation"
    ):
        # Separation images are always inverted, regardless of /Decode.
        decode = [1.0, 0.0] * len(img.getbands())
    # Identity arrays look like [0, 1, 0, 1, ...]; only act otherwise.
    if decode is not None and not all(decode[i] == i % 2 for i in range(len(decode))):
        lut: List[int] = []
        for i in range(0, len(decode), 2):
            # Each (dmin, dmax) pair produces a 256-entry mapping for one band.
            dmin = decode[i]
            dmax = decode[i + 1]
            lut.extend(
                round(255.0 * (j / 255.0 * (dmax - dmin) + dmin)) for j in range(256)
            )
        img = img.point(lut)
    return img
|
||||
|
||||
|
||||
def _get_mode_and_invert_color(
    x_object_obj: Dict[str, Any], colors: int, color_space: Union[str, List[Any], Any]
) -> Tuple[mode_str_type, bool]:
    """
    Determine the PIL mode (and whether color inversion is needed) for an
    image XObject, from its color space and bits-per-component.
    """
    if (
        IA.COLOR_SPACE in x_object_obj
        and x_object_obj[IA.COLOR_SPACE] == ColorSpaces.DEVICE_RGB
    ):
        # https://pillow.readthedocs.io/en/stable/handbook/concepts.html#modes
        # NOTE(review): this assignment is unconditionally overwritten by the
        # if/else below; it only serves as the annotation site for ``mode``.
        mode: mode_str_type = "RGB"
    if x_object_obj.get("/BitsPerComponent", 8) < 8:
        # Sub-byte samples: select the "<n>bit" pseudo-mode directly.
        mode, invert_color = _get_imagemode(
            f"{x_object_obj.get('/BitsPerComponent', 8)}bit", 0, ""
        )
    else:
        mode, invert_color = _get_imagemode(
            color_space,
            # A single color with a non-Gray space maps to the "palette" slot
            # (index 2) of _get_imagemode's mode table.
            2
            if (
                colors == 1
                and (
                    not isinstance(color_space, NullObject)
                    and "Gray" not in color_space
                )
            )
            else colors,
            "",
        )
    return mode, invert_color
|
||||
@ -0,0 +1,42 @@
|
||||
"""
|
||||
PDF specifies several annotation types which pypdf makes available here.
|
||||
|
||||
The names of the annotations and their attributes do not reflect the names in
|
||||
the specification in all cases. For example, the PDF standard defines a
|
||||
'Square' annotation that does not actually need to be square. For this reason,
|
||||
pypdf calls it 'Rectangle'.
|
||||
|
||||
At their core, all annotation types are DictionaryObjects. That means if pypdf
|
||||
does not implement a feature, users can easily extend the given functionality.
|
||||
"""
|
||||
|
||||
|
||||
from ._base import NO_FLAGS, AnnotationDictionary
|
||||
from ._markup_annotations import (
|
||||
Ellipse,
|
||||
FreeText,
|
||||
Highlight,
|
||||
Line,
|
||||
MarkupAnnotation,
|
||||
Polygon,
|
||||
PolyLine,
|
||||
Rectangle,
|
||||
Text,
|
||||
)
|
||||
from ._non_markup_annotations import Link, Popup
|
||||
|
||||
__all__ = [
|
||||
"NO_FLAGS",
|
||||
"AnnotationDictionary",
|
||||
"Ellipse",
|
||||
"FreeText",
|
||||
"Highlight",
|
||||
"Line",
|
||||
"Link",
|
||||
"MarkupAnnotation",
|
||||
"PolyLine",
|
||||
"Polygon",
|
||||
"Popup",
|
||||
"Rectangle",
|
||||
"Text",
|
||||
]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
27
venv/lib/python3.12/site-packages/pypdf/annotations/_base.py
Normal file
27
venv/lib/python3.12/site-packages/pypdf/annotations/_base.py
Normal file
@ -0,0 +1,27 @@
|
||||
from abc import ABC
|
||||
|
||||
from ..constants import AnnotationFlag
|
||||
from ..generic import NameObject, NumberObject
|
||||
from ..generic._data_structures import DictionaryObject
|
||||
|
||||
|
||||
class AnnotationDictionary(DictionaryObject, ABC):
    """
    Abstract base for all annotation dictionaries.

    Sets the common ``/Type = /Annot`` entry and exposes the annotation
    flags (``/F``) as a read/write property.
    """

    def __init__(self) -> None:
        from ..generic._base import NameObject

        # /Rect should not be added here as Polygon and PolyLine can automatically set it
        self[NameObject("/Type")] = NameObject("/Annot")
        # The flags were NOT added to the constructor on purpose:
        # We expect that most users don't want to change the default.
        # If they do, they can use the property. The default is 0.

    @property
    def flags(self) -> AnnotationFlag:
        """Annotation flags (``/F`` entry); 0 when the entry is absent."""
        return self.get(NameObject("/F"), AnnotationFlag(0))

    @flags.setter
    def flags(self, value: AnnotationFlag) -> None:
        self[NameObject("/F")] = NumberObject(value)
|
||||
|
||||
|
||||
NO_FLAGS = AnnotationFlag(0)
|
||||
@ -0,0 +1,315 @@
|
||||
import sys
|
||||
from abc import ABC
|
||||
from typing import Any, List, Optional, Tuple, Union
|
||||
|
||||
from .._utils import deprecation_with_replacement
|
||||
from ..constants import AnnotationFlag
|
||||
from ..generic import ArrayObject, DictionaryObject
|
||||
from ..generic._base import (
|
||||
BooleanObject,
|
||||
FloatObject,
|
||||
NameObject,
|
||||
NumberObject,
|
||||
TextStringObject,
|
||||
)
|
||||
from ..generic._rectangle import RectangleObject
|
||||
from ..generic._utils import hex_to_rgb
|
||||
from ._base import NO_FLAGS, AnnotationDictionary
|
||||
|
||||
if sys.version_info[:2] >= (3, 10):
|
||||
from typing import TypeAlias
|
||||
else:
|
||||
# PEP 613 introduced typing.TypeAlias with Python 3.10
|
||||
# For older Python versions, the backport typing_extensions is necessary:
|
||||
from typing_extensions import TypeAlias
|
||||
|
||||
|
||||
Vertex: TypeAlias = Tuple[float, float]
|
||||
|
||||
|
||||
def _get_bounding_rectangle(vertices: List[Vertex]) -> RectangleObject:
    """Return the smallest axis-aligned rectangle containing all ``vertices``."""
    xs = [vx for vx, _ in vertices]
    ys = [vy for _, vy in vertices]
    return RectangleObject((min(xs), min(ys), max(xs), max(ys)))
|
||||
|
||||
|
||||
class MarkupAnnotation(AnnotationDictionary, ABC):
    """
    Base class for all markup annotations.

    Args:
        title_bar: Text to be displayed in the title bar of the annotation;
            by convention this is the name of the author

    """

    def __init__(self, *, title_bar: Optional[str] = None) -> None:
        # NOTE(review): super().__init__() is not called here, so
        # AnnotationDictionary.__init__ (which sets /Type = /Annot) does not
        # run for markup annotations; several subclasses set /Type explicitly
        # instead — confirm this is intentional.
        if title_bar is not None:
            # /T holds the title-bar text (conventionally the author).
            self[NameObject("/T")] = TextStringObject(title_bar)
|
||||
|
||||
|
||||
class Text(MarkupAnnotation):
    """
    A text annotation.

    Args:
        rect: array of four integers ``[xLL, yLL, xUR, yUR]``
            specifying the clickable rectangular area
        text: The text that is added to the document
        open: Whether the annotation is initially displayed open
        flags: Annotation flags; defaults to ``NO_FLAGS``

    """

    def __init__(
        self,
        *,
        rect: Union[RectangleObject, Tuple[float, float, float, float]],
        text: str,
        open: bool = False,
        flags: int = NO_FLAGS,
        **kwargs: Any,
    ) -> None:
        super().__init__(**kwargs)
        self[NameObject("/Subtype")] = NameObject("/Text")
        self[NameObject("/Rect")] = RectangleObject(rect)
        self[NameObject("/Contents")] = TextStringObject(text)
        self[NameObject("/Open")] = BooleanObject(open)
        # NOTE(review): flags are written under "/Flags" here, while the
        # AnnotationDictionary.flags property reads/writes "/F" — confirm
        # which key is intended.
        self[NameObject("/Flags")] = NumberObject(flags)
|
||||
|
||||
|
||||
class FreeText(MarkupAnnotation):
|
||||
"""A FreeText annotation"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
text: str,
|
||||
rect: Union[RectangleObject, Tuple[float, float, float, float]],
|
||||
font: str = "Helvetica",
|
||||
bold: bool = False,
|
||||
italic: bool = False,
|
||||
font_size: str = "14pt",
|
||||
font_color: str = "000000",
|
||||
border_color: Optional[str] = "000000",
|
||||
background_color: Optional[str] = "ffffff",
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
super().__init__(**kwargs)
|
||||
self[NameObject("/Subtype")] = NameObject("/FreeText")
|
||||
self[NameObject("/Rect")] = RectangleObject(rect)
|
||||
|
||||
# Table 225 of the 1.7 reference ("CSS2 style attributes used in rich text strings")
|
||||
font_str = "font: "
|
||||
if italic:
|
||||
font_str = f"{font_str}italic "
|
||||
else:
|
||||
font_str = f"{font_str}normal "
|
||||
if bold:
|
||||
font_str = f"{font_str}bold "
|
||||
else:
|
||||
font_str = f"{font_str}normal "
|
||||
font_str = f"{font_str}{font_size} {font}"
|
||||
font_str = f"{font_str};text-align:left;color:#{font_color}"
|
||||
|
||||
default_appearance_string = ""
|
||||
if border_color:
|
||||
for st in hex_to_rgb(border_color):
|
||||
default_appearance_string = f"{default_appearance_string}{st} "
|
||||
default_appearance_string = f"{default_appearance_string}rg"
|
||||
|
||||
self.update(
|
||||
{
|
||||
NameObject("/Subtype"): NameObject("/FreeText"),
|
||||
NameObject("/Rect"): RectangleObject(rect),
|
||||
NameObject("/Contents"): TextStringObject(text),
|
||||
# font size color
|
||||
NameObject("/DS"): TextStringObject(font_str),
|
||||
NameObject("/DA"): TextStringObject(default_appearance_string),
|
||||
}
|
||||
)
|
||||
if border_color is None:
|
||||
# Border Style
|
||||
self[NameObject("/BS")] = DictionaryObject(
|
||||
{
|
||||
# width of 0 means no border
|
||||
NameObject("/W"): NumberObject(0)
|
||||
}
|
||||
)
|
||||
if background_color is not None:
|
||||
self[NameObject("/C")] = ArrayObject(
|
||||
[FloatObject(n) for n in hex_to_rgb(background_color)]
|
||||
)
|
||||
|
||||
|
||||
class Line(MarkupAnnotation):
|
||||
def __init__(
|
||||
self,
|
||||
p1: Vertex,
|
||||
p2: Vertex,
|
||||
rect: Union[RectangleObject, Tuple[float, float, float, float]],
|
||||
text: str = "",
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
super().__init__(**kwargs)
|
||||
self.update(
|
||||
{
|
||||
NameObject("/Subtype"): NameObject("/Line"),
|
||||
NameObject("/Rect"): RectangleObject(rect),
|
||||
NameObject("/L"): ArrayObject(
|
||||
[
|
||||
FloatObject(p1[0]),
|
||||
FloatObject(p1[1]),
|
||||
FloatObject(p2[0]),
|
||||
FloatObject(p2[1]),
|
||||
]
|
||||
),
|
||||
NameObject("/LE"): ArrayObject(
|
||||
[
|
||||
NameObject("/None"),
|
||||
NameObject("/None"),
|
||||
]
|
||||
),
|
||||
NameObject("/IC"): ArrayObject(
|
||||
[
|
||||
FloatObject(0.5),
|
||||
FloatObject(0.5),
|
||||
FloatObject(0.5),
|
||||
]
|
||||
),
|
||||
NameObject("/Contents"): TextStringObject(text),
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
class PolyLine(MarkupAnnotation):
    """
    A polyline annotation: straight line segments connecting ``vertices``.

    The bounding ``/Rect`` is derived automatically from the vertices.

    Args:
        vertices: Non-empty list of ``(x, y)`` points.

    Raises:
        ValueError: If ``vertices`` is empty.
    """

    def __init__(
        self,
        vertices: List[Vertex],
        **kwargs: Any,
    ) -> None:
        super().__init__(**kwargs)
        if len(vertices) == 0:
            # Fix: the message previously said "polygon" — this is PolyLine.
            raise ValueError("A polyline needs at least 1 vertex with two coordinates")
        # Flatten (x, y) pairs into the /Vertices number array.
        coord_list = []
        for x, y in vertices:
            coord_list.append(NumberObject(x))
            coord_list.append(NumberObject(y))
        self.update(
            {
                NameObject("/Subtype"): NameObject("/PolyLine"),
                NameObject("/Vertices"): ArrayObject(coord_list),
                NameObject("/Rect"): RectangleObject(_get_bounding_rectangle(vertices)),
            }
        )
|
||||
|
||||
|
||||
class Rectangle(MarkupAnnotation):
|
||||
def __init__(
|
||||
self,
|
||||
rect: Union[RectangleObject, Tuple[float, float, float, float]],
|
||||
*,
|
||||
interior_color: Optional[str] = None,
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
if "interiour_color" in kwargs:
|
||||
deprecation_with_replacement("interiour_color", "interior_color", "5.0.0")
|
||||
interior_color = kwargs["interiour_color"]
|
||||
del kwargs["interiour_color"]
|
||||
super().__init__(**kwargs)
|
||||
self.update(
|
||||
{
|
||||
NameObject("/Type"): NameObject("/Annot"),
|
||||
NameObject("/Subtype"): NameObject("/Square"),
|
||||
NameObject("/Rect"): RectangleObject(rect),
|
||||
}
|
||||
)
|
||||
|
||||
if interior_color:
|
||||
self[NameObject("/IC")] = ArrayObject(
|
||||
[FloatObject(n) for n in hex_to_rgb(interior_color)]
|
||||
)
|
||||
|
||||
|
||||
class Highlight(MarkupAnnotation):
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
rect: Union[RectangleObject, Tuple[float, float, float, float]],
|
||||
quad_points: ArrayObject,
|
||||
highlight_color: str = "ff0000",
|
||||
printing: bool = False,
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
super().__init__(**kwargs)
|
||||
self.update(
|
||||
{
|
||||
NameObject("/Subtype"): NameObject("/Highlight"),
|
||||
NameObject("/Rect"): RectangleObject(rect),
|
||||
NameObject("/QuadPoints"): quad_points,
|
||||
NameObject("/C"): ArrayObject(
|
||||
[FloatObject(n) for n in hex_to_rgb(highlight_color)]
|
||||
),
|
||||
}
|
||||
)
|
||||
if printing:
|
||||
self.flags = AnnotationFlag.PRINT
|
||||
|
||||
|
||||
class Ellipse(MarkupAnnotation):
|
||||
def __init__(
|
||||
self,
|
||||
rect: Union[RectangleObject, Tuple[float, float, float, float]],
|
||||
*,
|
||||
interior_color: Optional[str] = None,
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
if "interiour_color" in kwargs:
|
||||
deprecation_with_replacement("interiour_color", "interior_color", "5.0.0")
|
||||
interior_color = kwargs["interiour_color"]
|
||||
del kwargs["interiour_color"]
|
||||
super().__init__(**kwargs)
|
||||
|
||||
self.update(
|
||||
{
|
||||
NameObject("/Type"): NameObject("/Annot"),
|
||||
NameObject("/Subtype"): NameObject("/Circle"),
|
||||
NameObject("/Rect"): RectangleObject(rect),
|
||||
}
|
||||
)
|
||||
|
||||
if interior_color:
|
||||
self[NameObject("/IC")] = ArrayObject(
|
||||
[FloatObject(n) for n in hex_to_rgb(interior_color)]
|
||||
)
|
||||
|
||||
|
||||
class Polygon(MarkupAnnotation):
|
||||
def __init__(
|
||||
self,
|
||||
vertices: List[Tuple[float, float]],
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
super().__init__(**kwargs)
|
||||
if len(vertices) == 0:
|
||||
raise ValueError("A polygon needs at least 1 vertex with two coordinates")
|
||||
|
||||
coord_list = []
|
||||
for x, y in vertices:
|
||||
coord_list.append(NumberObject(x))
|
||||
coord_list.append(NumberObject(y))
|
||||
self.update(
|
||||
{
|
||||
NameObject("/Type"): NameObject("/Annot"),
|
||||
NameObject("/Subtype"): NameObject("/Polygon"),
|
||||
NameObject("/Vertices"): ArrayObject(coord_list),
|
||||
NameObject("/IT"): NameObject("/PolygonCloud"),
|
||||
NameObject("/Rect"): RectangleObject(_get_bounding_rectangle(vertices)),
|
||||
}
|
||||
)
|
||||
@ -0,0 +1,106 @@
|
||||
from typing import TYPE_CHECKING, Any, Optional, Tuple, Union
|
||||
|
||||
from ..generic._base import (
|
||||
BooleanObject,
|
||||
NameObject,
|
||||
NumberObject,
|
||||
TextStringObject,
|
||||
)
|
||||
from ..generic._data_structures import ArrayObject, DictionaryObject
|
||||
from ..generic._fit import DEFAULT_FIT, Fit
|
||||
from ..generic._rectangle import RectangleObject
|
||||
from ._base import AnnotationDictionary
|
||||
|
||||
|
||||
class Link(AnnotationDictionary):
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
rect: Union[RectangleObject, Tuple[float, float, float, float]],
|
||||
border: Optional[ArrayObject] = None,
|
||||
url: Optional[str] = None,
|
||||
target_page_index: Optional[int] = None,
|
||||
fit: Fit = DEFAULT_FIT,
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
super().__init__(**kwargs)
|
||||
if TYPE_CHECKING:
|
||||
from ..types import BorderArrayType
|
||||
|
||||
is_external = url is not None
|
||||
is_internal = target_page_index is not None
|
||||
if not is_external and not is_internal:
|
||||
raise ValueError(
|
||||
"Either 'url' or 'target_page_index' have to be provided. Both were None."
|
||||
)
|
||||
if is_external and is_internal:
|
||||
raise ValueError(
|
||||
"Either 'url' or 'target_page_index' have to be provided. "
|
||||
f"{url=}, {target_page_index=}"
|
||||
)
|
||||
|
||||
border_arr: BorderArrayType
|
||||
if border is not None:
|
||||
border_arr = [NumberObject(n) for n in border[:3]]
|
||||
if len(border) == 4:
|
||||
dash_pattern = ArrayObject([NumberObject(n) for n in border[3]])
|
||||
border_arr.append(dash_pattern)
|
||||
else:
|
||||
border_arr = [NumberObject(0)] * 3
|
||||
|
||||
self.update(
|
||||
{
|
||||
NameObject("/Type"): NameObject("/Annot"),
|
||||
NameObject("/Subtype"): NameObject("/Link"),
|
||||
NameObject("/Rect"): RectangleObject(rect),
|
||||
NameObject("/Border"): ArrayObject(border_arr),
|
||||
}
|
||||
)
|
||||
if is_external:
|
||||
self[NameObject("/A")] = DictionaryObject(
|
||||
{
|
||||
NameObject("/S"): NameObject("/URI"),
|
||||
NameObject("/Type"): NameObject("/Action"),
|
||||
NameObject("/URI"): TextStringObject(url),
|
||||
}
|
||||
)
|
||||
if is_internal:
|
||||
# This needs to be updated later!
|
||||
dest_deferred = DictionaryObject(
|
||||
{
|
||||
"target_page_index": NumberObject(target_page_index),
|
||||
"fit": NameObject(fit.fit_type),
|
||||
"fit_args": fit.fit_args,
|
||||
}
|
||||
)
|
||||
self[NameObject("/Dest")] = dest_deferred
|
||||
|
||||
|
||||
class Popup(AnnotationDictionary):
    """
    A popup annotation, displaying the contents of its parent annotation.

    Args:
        rect: Rectangle the popup occupies on the page.
        parent: The annotation this popup belongs to; it must expose an
            ``indirect_reference`` for the ``/Parent`` entry to be written.
        open: Whether the popup is initially displayed open.

    """

    def __init__(
        self,
        *,
        rect: Union[RectangleObject, Tuple[float, float, float, float]],
        parent: Optional[DictionaryObject] = None,
        open: bool = False,
        **kwargs: Any,
    ) -> None:
        super().__init__(**kwargs)
        self.update(
            {
                NameObject("/Subtype"): NameObject("/Popup"),
                NameObject("/Rect"): RectangleObject(rect),
                NameObject("/Open"): BooleanObject(open),
            }
        )
        if parent:
            # This needs to be an indirect object
            try:
                self[NameObject("/Parent")] = parent.indirect_reference
            except AttributeError:
                # Parent was not registered with a writer: warn instead of
                # failing, and omit the /Parent entry.
                from .._utils import logger_warning

                logger_warning(
                    "Unregistered Parent object : No Parent field set",
                    __name__,
                )
|
||||
722
venv/lib/python3.12/site-packages/pypdf/constants.py
Normal file
722
venv/lib/python3.12/site-packages/pypdf/constants.py
Normal file
@ -0,0 +1,722 @@
|
||||
"""Various constants, enums, and flags to aid readability."""
|
||||
|
||||
from enum import Enum, IntFlag, auto, unique
|
||||
from typing import Dict, Tuple
|
||||
|
||||
|
||||
class StrEnum(str, Enum):  # Once we are on Python 3.11+: enum.StrEnum
    """String-backed enum whose ``str()`` is the plain member value."""

    def __str__(self) -> str:
        return f"{self.value}"
|
||||
|
||||
|
||||
class Core:
    """Keywords that don't quite belong anywhere else."""

    OUTLINES = "/Outlines"  # document outline dictionary
    THREADS = "/Threads"  # article threads array
    PAGE = "/Page"  # /Type value of a leaf page node
    PAGES = "/Pages"  # /Type value of a page-tree node
    CATALOG = "/Catalog"  # /Type value of the document catalog
|
||||
|
||||
|
||||
class TrailerKeys:
    """Keys found in the file trailer dictionary."""

    ROOT = "/Root"  # indirect reference to the document catalog
    ENCRYPT = "/Encrypt"  # encryption dictionary, present for encrypted files
    ID = "/ID"  # file identifier
    INFO = "/Info"  # document information dictionary
    SIZE = "/Size"  # number of entries in the cross-reference table
    PREV = "/Prev"  # byte offset of the previous cross-reference section
|
||||
|
||||
|
||||
class CatalogAttributes:
    """Name-related keys of the document catalog."""

    NAMES = "/Names"  # name dictionary
    DESTS = "/Dests"  # named destinations dictionary
|
||||
|
||||
|
||||
class EncryptionDictAttributes:
    """
    Additional encryption dictionary entries for the standard security handler.

    Used when reading or writing encrypted documents.

    Table 3.19, Page 122.
    Table 21 of the 2.0 manual.
    """

    R = "/R"  # number, required; revision of the standard security handler
    O = "/O"  # 32-byte string, required # noqa: E741
    U = "/U"  # 32-byte string, required
    P = "/P"  # integer flag, required; permitted operations
    ENCRYPT_METADATA = "/EncryptMetadata"  # boolean flag, optional
|
||||
|
||||
|
||||
class UserAccessPermissions(IntFlag):
    """
    Table 3.20 User access permissions.
    Table 22 of the 2.0 manual.

    ``R*`` members are reserved bits with mandated values; all other
    members are the meaningful permission bits.
    """

    R1 = 1
    R2 = 2
    PRINT = 4
    MODIFY = 8
    EXTRACT = 16
    ADD_OR_MODIFY = 32
    R7 = 64
    R8 = 128
    FILL_FORM_FIELDS = 256
    EXTRACT_TEXT_AND_GRAPHICS = 512
    ASSEMBLE_DOC = 1024
    PRINT_TO_REPRESENTATION = 2048
    R13 = 2**12
    R14 = 2**13
    R15 = 2**14
    R16 = 2**15
    R17 = 2**16
    R18 = 2**17
    R19 = 2**18
    R20 = 2**19
    R21 = 2**20
    R22 = 2**21
    R23 = 2**22
    R24 = 2**23
    R25 = 2**24
    R26 = 2**25
    R27 = 2**26
    R28 = 2**27
    R29 = 2**28
    R30 = 2**29
    R31 = 2**30
    R32 = 2**31

    @classmethod
    def _is_reserved(cls, name: str) -> bool:
        """Check if the given name corresponds to a reserved flag entry."""
        return name[:1] == "R" and name[1:].isdigit()

    @classmethod
    def _is_active(cls, name: str) -> bool:
        """Check if the given reserved name defaults to 1 = active."""
        return name != "R1" and name != "R2"

    def to_dict(self) -> Dict[str, bool]:
        """Convert the given flag value to a corresponding verbose name mapping."""
        cls = UserAccessPermissions
        return {
            name.lower(): (self & flag) == flag
            for name, flag in cls.__members__.items()
            if not cls._is_reserved(name)
        }

    @classmethod
    def from_dict(cls, value: Dict[str, bool]) -> "UserAccessPermissions":
        """Convert the verbose name mapping to the corresponding flag value."""
        remaining = value.copy()
        flags = cls(0)
        for name, flag in cls.__members__.items():
            if cls._is_reserved(name):
                # Reserved names have a required value. Use it.
                if cls._is_active(name):
                    flags |= flag
            elif remaining.pop(name.lower(), False):
                flags |= flag
        if remaining:
            raise ValueError(f"Unknown dictionary keys: {remaining!r}")
        return flags

    @classmethod
    def all(cls) -> "UserAccessPermissions":
        # Every bit set except the two reserved-inactive bits R1 and R2.
        return cls((2**32 - 1) ^ cls.R1 ^ cls.R2)
|
||||
|
||||
|
||||
class Resources:
    """
    Keys of a resource dictionary.

    Table 3.30 Entries in a resource dictionary.
    Table 34 in the 2.0 reference.
    """

    EXT_G_STATE = "/ExtGState"  # dictionary, optional
    COLOR_SPACE = "/ColorSpace"  # dictionary, optional
    PATTERN = "/Pattern"  # dictionary, optional
    SHADING = "/Shading"  # dictionary, optional
    XOBJECT = "/XObject"  # dictionary, optional
    FONT = "/Font"  # dictionary, optional
    PROC_SET = "/ProcSet"  # array, optional
    PROPERTIES = "/Properties"  # dictionary, optional
|
||||
|
||||
|
||||
class Ressources:  # deprecated
    """
    Use :class: `Resources` instead.

    .. deprecated:: 5.0.0
    """

    # Intentionally empty: kept only so existing imports of the misspelled
    # name keep working.
|
||||
|
||||
|
||||
class PagesAttributes:
    """Keys of a page-tree node. §7.7.3.2 of the 1.7 and 2.0 reference."""

    TYPE = "/Type"  # name, required; must be /Pages
    PARENT = "/Parent"  # dictionary, required; indirect reference to pages object
    KIDS = "/Kids"  # array, required; List of indirect references
    COUNT = "/Count"
    # integer, required; the number of leaf nodes (page objects)
    # that are descendants of this node within the page tree
|
||||
|
||||
|
||||
class PageAttributes:
    """Keys of a page object. §7.7.3.3 of the 1.7 and 2.0 reference."""

    TYPE = "/Type"  # name, required; must be /Page
    PARENT = "/Parent"  # dictionary, required; a pages object
    LAST_MODIFIED = (
        "/LastModified"  # date, optional; date and time of last modification
    )
    RESOURCES = "/Resources"  # dictionary, required if there are any
    MEDIABOX = "/MediaBox"  # rectangle, required; rectangle specifying page size
    CROPBOX = "/CropBox"  # rectangle, optional
    BLEEDBOX = "/BleedBox"  # rectangle, optional
    TRIMBOX = "/TrimBox"  # rectangle, optional
    ARTBOX = "/ArtBox"  # rectangle, optional
    BOX_COLOR_INFO = "/BoxColorInfo"  # dictionary, optional
    CONTENTS = "/Contents"  # stream or array, optional
    ROTATE = "/Rotate"  # integer, optional; page rotation in degrees
    GROUP = "/Group"  # dictionary, optional; page group
    THUMB = "/Thumb"  # stream, optional; indirect reference to image of the page
    B = "/B"  # array, optional
    DUR = "/Dur"  # number, optional
    TRANS = "/Trans"  # dictionary, optional
    ANNOTS = "/Annots"  # array, optional; an array of annotations
    AA = "/AA"  # dictionary, optional
    METADATA = "/Metadata"  # stream, optional
    PIECE_INFO = "/PieceInfo"  # dictionary, optional
    STRUCT_PARENTS = "/StructParents"  # integer, optional
    ID = "/ID"  # byte string, optional
    PZ = "/PZ"  # number, optional
    SEPARATION_INFO = "/SeparationInfo"  # dictionary, optional
    TABS = "/Tabs"  # name, optional
    TEMPLATE_INSTANTIATED = "/TemplateInstantiated"  # name, optional
    PRES_STEPS = "/PresSteps"  # dictionary, optional
    USER_UNIT = "/UserUnit"  # number, optional
    VP = "/VP"  # dictionary, optional
    AF = "/AF"  # array of dictionaries, optional
    OUTPUT_INTENTS = "/OutputIntents"  # array, optional
    D_PART = "/DPart"  # dictionary, required, if this page is within the range of a DPart, not permitted otherwise
|
||||
|
||||
|
||||
class FileSpecificationDictionaryEntries:
    """Table 3.41 Entries in a file specification dictionary."""

    Type = "/Type"
    FS = "/FS"  # The name of the file system to be used to interpret this file specification
    F = "/F"  # A file specification string of the form described in §3.10.1
    UF = "/UF"  # A Unicode string of the file as described in §3.10.1
    DOS = "/DOS"  # legacy platform-specific file name
    Mac = "/Mac"  # legacy platform-specific file name
    Unix = "/Unix"  # legacy platform-specific file name
    ID = "/ID"
    V = "/V"
    EF = "/EF"  # dictionary, containing a subset of the keys F, UF, DOS, Mac, and Unix
    RF = "/RF"  # dictionary, containing arrays of /EmbeddedFile
    DESC = "/Desc"  # description of the file
    Cl = "/Cl"
|
||||
|
||||
|
||||
class StreamAttributes:
    """
    Keys of a stream dictionary.

    Table 4.2.
    Table 5 in the 2.0 reference.
    """

    LENGTH = "/Length"  # integer, required
    FILTER = "/Filter"  # name or array of names, optional
    DECODE_PARMS = "/DecodeParms"  # variable, optional -- 'decodeParams is wrong
|
||||
|
||||
|
||||
@unique
class FilterTypes(StrEnum):
    """Standard stream filter names. §7.4 of the 1.7 and 2.0 references."""

    ASCII_HEX_DECODE = "/ASCIIHexDecode"  # abbreviation: AHx
    ASCII_85_DECODE = "/ASCII85Decode"  # abbreviation: A85
    LZW_DECODE = "/LZWDecode"  # abbreviation: LZW
    FLATE_DECODE = "/FlateDecode"  # abbreviation: Fl, PDF 1.2
    RUN_LENGTH_DECODE = "/RunLengthDecode"  # abbreviation: RL
    CCITT_FAX_DECODE = "/CCITTFaxDecode"  # abbreviation: CCF
    DCT_DECODE = "/DCTDecode"  # abbreviation: DCT
    JPX_DECODE = "/JPXDecode"
|
||||
|
||||
|
||||
class FilterTypeAbbreviations:
    """
    Abbreviated filter names, used by inline images.
    §8.9.7 of the 1.7 and 2.0 references.
    """

    AHx = "/AHx"
    A85 = "/A85"
    LZW = "/LZW"
    FL = "/Fl"  # FlateDecode
    RL = "/RL"
    CCF = "/CCF"
    DCT = "/DCT"
|
||||
|
||||
|
||||
class LzwFilterParameters:
    """
    Keys of the LZW decode-parameter dictionary.

    Table 4.4.
    Table 8 in the 2.0 reference.
    """

    PREDICTOR = "/Predictor"  # integer
    COLORS = "/Colors"  # integer
    BITS_PER_COMPONENT = "/BitsPerComponent"  # integer
    COLUMNS = "/Columns"  # integer
    EARLY_CHANGE = "/EarlyChange"  # integer
|
||||
|
||||
|
||||
class CcittFaxDecodeParameters:
    """
    Keys of the CCITTFaxDecode decode-parameter dictionary.

    Table 4.5.
    Table 11 in the 2.0 reference.
    """

    K = "/K"  # integer
    END_OF_LINE = "/EndOfLine"  # boolean
    ENCODED_BYTE_ALIGN = "/EncodedByteAlign"  # boolean
    COLUMNS = "/Columns"  # integer
    ROWS = "/Rows"  # integer
    END_OF_BLOCK = "/EndOfBlock"  # boolean
    BLACK_IS_1 = "/BlackIs1"  # boolean
    DAMAGED_ROWS_BEFORE_ERROR = "/DamagedRowsBeforeError"  # integer
|
||||
|
||||
|
||||
class ImageAttributes:
    """Keys of an image XObject. §11.6.5 of the 1.7 and 2.0 references."""

    TYPE = "/Type"  # name, required; must be /XObject
    SUBTYPE = "/Subtype"  # name, required; must be /Image
    NAME = "/Name"  # name, required
    WIDTH = "/Width"  # integer, required
    HEIGHT = "/Height"  # integer, required
    BITS_PER_COMPONENT = "/BitsPerComponent"  # integer, required
    COLOR_SPACE = "/ColorSpace"  # name, required
    DECODE = "/Decode"  # array, optional
    INTENT = "/Intent"  # string, optional
    INTERPOLATE = "/Interpolate"  # boolean, optional
    IMAGE_MASK = "/ImageMask"  # boolean, optional
    MASK = "/Mask"  # 1-bit image mask stream
    S_MASK = "/SMask"  # dictionary or name, optional
|
||||
|
||||
|
||||
class ColorSpaces:
    """Device color space names."""

    DEVICE_RGB = "/DeviceRGB"
    DEVICE_CMYK = "/DeviceCMYK"
    DEVICE_GRAY = "/DeviceGray"
|
||||
|
||||
|
||||
class TypArguments:
    """Destination position arguments. Table 8.2 of the PDF 1.7 reference."""

    LEFT = "/Left"
    RIGHT = "/Right"
    BOTTOM = "/Bottom"
    TOP = "/Top"
|
||||
|
||||
|
||||
class TypFitArguments:
    """Destination fit types. Table 8.2 of the PDF 1.7 reference."""

    FIT = "/Fit"
    FIT_V = "/FitV"
    FIT_BV = "/FitBV"
    FIT_B = "/FitB"
    FIT_H = "/FitH"
    FIT_BH = "/FitBH"
    FIT_R = "/FitR"
    XYZ = "/XYZ"
|
||||
|
||||
|
||||
class GoToActionArguments:
    """Keys of a go-to action dictionary."""

    S = "/S"  # name, required: type of action
    D = "/D"  # name / byte string /array, required: Destination to jump to
|
||||
|
||||
|
||||
class AnnotationDictionaryAttributes:
    """
    Table 8.15 Entries common to all annotation dictionaries.

    Attribute names mirror the PDF key spellings.
    """

    Type = "/Type"
    Subtype = "/Subtype"
    Rect = "/Rect"
    Contents = "/Contents"
    P = "/P"
    NM = "/NM"
    M = "/M"
    F = "/F"
    AP = "/AP"
    AS = "/AS"
    DA = "/DA"
    Border = "/Border"
    C = "/C"
    StructParent = "/StructParent"
    OC = "/OC"
|
||||
|
||||
|
||||
class InteractiveFormDictEntries:
    """Keys of the interactive form (AcroForm) dictionary."""

    Fields = "/Fields"
    NeedAppearances = "/NeedAppearances"
    SigFlags = "/SigFlags"
    CO = "/CO"
    DR = "/DR"
    DA = "/DA"
    Q = "/Q"
    XFA = "/XFA"
|
||||
|
||||
|
||||
class FieldDictionaryAttributes:
    """
    Entries common to all field dictionaries (Table 8.69 PDF 1.7 reference)
    (*very partially documented here*).

    FFBits provides the constants used for `/Ff` from Table 8.70/8.75/8.77/8.79
    """

    FT = "/FT"  # name, required for terminal fields
    Parent = "/Parent"  # dictionary, required for children
    Kids = "/Kids"  # array, sometimes required
    T = "/T"  # text string, optional
    TU = "/TU"  # text string, optional
    TM = "/TM"  # text string, optional
    Ff = "/Ff"  # integer, optional
    V = "/V"  # text string or array, optional
    DV = "/DV"  # text string, optional
    AA = "/AA"  # dictionary, optional
    Opt = "/Opt"  # array, optional

    class FfBits(IntFlag):
        """
        Ease building /Ff flags
        Some entries may be specific to:

        * Text (Tx) (Table 8.75 PDF 1.7 reference)
        * Buttons (Btn) (Table 8.77 PDF 1.7 reference)
        * Choice (Ch) (Table 8.79 PDF 1.7 reference)

        NOTE: ``RadiosInUnison`` and ``RichText`` intentionally share the
        value ``1 << 25``; the meaning depends on the field type, and the
        later name is an IntFlag alias of the earlier one.
        """

        ReadOnly = 1 << 0
        """common to Tx/Btn/Ch in Table 8.70"""
        Required = 1 << 1
        """common to Tx/Btn/Ch in Table 8.70"""
        NoExport = 1 << 2
        """common to Tx/Btn/Ch in Table 8.70"""

        Multiline = 1 << 12
        """Tx"""
        Password = 1 << 13
        """Tx"""

        NoToggleToOff = 1 << 14
        """Btn"""
        Radio = 1 << 15
        """Btn"""
        Pushbutton = 1 << 16
        """Btn"""

        Combo = 1 << 17
        """Ch"""
        Edit = 1 << 18
        """Ch"""
        Sort = 1 << 19
        """Ch"""

        FileSelect = 1 << 20
        """Tx"""

        MultiSelect = 1 << 21
        """Tx"""

        DoNotSpellCheck = 1 << 22
        """Tx/Ch"""
        DoNotScroll = 1 << 23
        """Tx"""
        Comb = 1 << 24
        """Tx"""

        RadiosInUnison = 1 << 25
        """Btn"""

        RichText = 1 << 25
        """Tx"""

        CommitOnSelChange = 1 << 26
        """Ch"""

    @classmethod
    def attributes(cls) -> Tuple[str, ...]:
        """
        Get a tuple of all the attributes present in a Field Dictionary.

        This method returns a tuple of all the attribute constants defined in
        the FieldDictionaryAttributes class. These attributes correspond to the
        entries that are common to all field dictionaries as specified in the
        PDF 1.7 reference.

        Returns:
            A tuple containing all the attribute constants.

        """
        return (
            cls.TM,
            cls.T,
            cls.FT,
            cls.Parent,
            cls.TU,
            cls.Ff,
            cls.V,
            cls.DV,
            cls.Kids,
            cls.AA,
        )

    @classmethod
    def attributes_dict(cls) -> Dict[str, str]:
        """
        Get a dictionary of attribute keys and their human-readable names.

        This method returns a dictionary where the keys are the attribute
        constants defined in the FieldDictionaryAttributes class and the values
        are their corresponding human-readable names. These attributes
        correspond to the entries that are common to all field dictionaries as
        specified in the PDF 1.7 reference.

        Returns:
            A dictionary containing attribute keys and their names.

        """
        return {
            cls.FT: "Field Type",
            cls.Parent: "Parent",
            cls.T: "Field Name",
            cls.TU: "Alternate Field Name",
            cls.TM: "Mapping Name",
            cls.Ff: "Field Flags",
            cls.V: "Value",
            cls.DV: "Default Value",
        }
|
||||
|
||||
|
||||
class CheckboxRadioButtonAttributes:
    """Table 8.76 Field flags common to all field types."""

    Opt = "/Opt"  # Options, Optional

    @classmethod
    def attributes(cls) -> Tuple[str, ...]:
        """
        Get a tuple of all the attributes present in a Field Dictionary.

        The attribute constants defined here correspond to the entries that
        are common to all field dictionaries as specified in the PDF 1.7
        reference.

        Returns:
            A tuple containing all the attribute constants.

        """
        return tuple([cls.Opt])

    @classmethod
    def attributes_dict(cls) -> Dict[str, str]:
        """
        Get a dictionary of attribute keys and their human-readable names.

        Keys are the attribute constants defined on this class; values are
        their human-readable names, per the PDF 1.7 reference.

        Returns:
            A dictionary containing attribute keys and their names.

        """
        return dict([(cls.Opt, "Options")])
|
||||
|
||||
|
||||
class FieldFlag(IntFlag):
    """Table 8.70 Field flags common to all field types."""

    READ_ONLY = 1  # bit 1
    REQUIRED = 2  # bit 2
    NO_EXPORT = 4  # bit 3
|
||||
|
||||
|
||||
class DocumentInformationAttributes:
    """Table 10.2 Entries in the document information dictionary."""

    TITLE = "/Title"  # text string, optional
    AUTHOR = "/Author"  # text string, optional
    SUBJECT = "/Subject"  # text string, optional
    KEYWORDS = "/Keywords"  # text string, optional
    CREATOR = "/Creator"  # text string, optional
    PRODUCER = "/Producer"  # text string, optional
    CREATION_DATE = "/CreationDate"  # date, optional
    MOD_DATE = "/ModDate"  # date, optional
    TRAPPED = "/Trapped"  # name, optional
|
||||
|
||||
|
||||
class PageLayouts:
    """
    Page layout names.

    Page 84, PDF 1.4 reference.
    Page 115, PDF 2.0 reference.
    """

    SINGLE_PAGE = "/SinglePage"
    ONE_COLUMN = "/OneColumn"
    TWO_COLUMN_LEFT = "/TwoColumnLeft"
    TWO_COLUMN_RIGHT = "/TwoColumnRight"
    TWO_PAGE_LEFT = "/TwoPageLeft"  # (PDF 1.5)
    TWO_PAGE_RIGHT = "/TwoPageRight"  # (PDF 1.5)
|
||||
|
||||
|
||||
class GraphicsStateParameters:
    """
    Table 58 – Entries in a Graphics State Parameter Dictionary

    Attribute names follow the abbreviated PDF key spellings.
    """

    TYPE = "/Type"  # name, optional
    LW = "/LW"  # number, optional
    LC = "/LC"  # integer, optional
    LJ = "/LJ"  # integer, optional
    ML = "/ML"  # number, optional
    D = "/D"  # array, optional
    RI = "/RI"  # name, optional
    OP = "/OP"
    op = "/op"
    OPM = "/OPM"
    FONT = "/Font"  # array, optional
    BG = "/BG"
    BG2 = "/BG2"
    UCR = "/UCR"
    UCR2 = "/UCR2"
    TR = "/TR"
    TR2 = "/TR2"
    HT = "/HT"
    FL = "/FL"
    SM = "/SM"
    SA = "/SA"
    BM = "/BM"
    S_MASK = "/SMask"  # dictionary or name, optional
    CA = "/CA"
    ca = "/ca"
    AIS = "/AIS"
    TK = "/TK"
|
||||
|
||||
|
||||
class CatalogDictionary:
    """Keys of the document catalog. §7.7.2 of the 1.7 and 2.0 references."""

    TYPE = "/Type"  # name, required; must be /Catalog
    VERSION = "/Version"  # name
    EXTENSIONS = "/Extensions"  # dictionary, optional; ISO 32000-1
    PAGES = "/Pages"  # dictionary, required
    PAGE_LABELS = "/PageLabels"  # number tree, optional
    NAMES = "/Names"  # dictionary, optional
    DESTS = "/Dests"  # dictionary, optional
    VIEWER_PREFERENCES = "/ViewerPreferences"  # dictionary, optional
    PAGE_LAYOUT = "/PageLayout"  # name, optional
    PAGE_MODE = "/PageMode"  # name, optional
    OUTLINES = "/Outlines"  # dictionary, optional
    THREADS = "/Threads"  # array, optional
    OPEN_ACTION = "/OpenAction"  # array or dictionary or name, optional
    AA = "/AA"  # dictionary, optional
    URI = "/URI"  # dictionary, optional
    ACRO_FORM = "/AcroForm"  # dictionary, optional
    METADATA = "/Metadata"  # stream, optional
    STRUCT_TREE_ROOT = "/StructTreeRoot"  # dictionary, optional
    MARK_INFO = "/MarkInfo"  # dictionary, optional
    LANG = "/Lang"  # text string, optional
    SPIDER_INFO = "/SpiderInfo"  # dictionary, optional
    OUTPUT_INTENTS = "/OutputIntents"  # array, optional
    PIECE_INFO = "/PieceInfo"  # dictionary, optional
    OC_PROPERTIES = "/OCProperties"  # dictionary, optional
    PERMS = "/Perms"  # dictionary, optional
    LEGAL = "/Legal"  # dictionary, optional
    REQUIREMENTS = "/Requirements"  # array, optional
    COLLECTION = "/Collection"  # dictionary, optional
    NEEDS_RENDERING = "/NeedsRendering"  # boolean, optional
    DSS = "/DSS"  # dictionary, optional
    AF = "/AF"  # array of dictionaries, optional
    D_PART_ROOT = "/DPartRoot"  # dictionary, optional
|
||||
|
||||
|
||||
class OutlineFontFlag(IntFlag):
    """A class used as an enumerable flag for formatting an outline font."""

    italic = 1  # bit 1
    bold = 2  # bit 2
|
||||
|
||||
|
||||
class PageLabelStyle:
    """
    Page label numbering styles.

    Table 8.10 in the 1.7 reference.
    Table 161 in the 2.0 reference.
    """

    DECIMAL = "/D"  # Decimal Arabic numerals
    UPPERCASE_ROMAN = "/R"  # Uppercase Roman numerals
    LOWERCASE_ROMAN = "/r"  # Lowercase Roman numerals
    UPPERCASE_LETTER = "/A"  # Uppercase letters
    LOWERCASE_LETTER = "/a"  # Lowercase letters
|
||||
|
||||
|
||||
class AnnotationFlag(IntFlag):
    """Values of the annotation /F entry. See §12.5.3 "Annotation Flags"."""

    INVISIBLE = 1
    HIDDEN = 2
    PRINT = 4
    NO_ZOOM = 8
    NO_ROTATE = 16
    NO_VIEW = 32
    READ_ONLY = 64
    LOCKED = 128
    TOGGLE_NO_VIEW = 256
    LOCKED_CONTENTS = 512
|
||||
|
||||
|
||||
# The classes whose attributes collectively enumerate known PDF dictionary
# keys, gathered in one tuple for bulk introspection.
PDF_KEYS = (
    AnnotationDictionaryAttributes,
    CatalogAttributes,
    CatalogDictionary,
    CcittFaxDecodeParameters,
    CheckboxRadioButtonAttributes,
    ColorSpaces,
    Core,
    DocumentInformationAttributes,
    EncryptionDictAttributes,
    FieldDictionaryAttributes,
    FileSpecificationDictionaryEntries,
    FilterTypeAbbreviations,
    FilterTypes,
    GoToActionArguments,
    GraphicsStateParameters,
    ImageAttributes,
    InteractiveFormDictEntries,
    LzwFilterParameters,
    PageAttributes,
    PageLayouts,
    PagesAttributes,
    Resources,
    StreamAttributes,
    TrailerKeys,
    TypArguments,
    TypFitArguments,
)
|
||||
|
||||
|
||||
class ImageType(IntFlag):
    """Flags selecting which image varieties an operation considers."""

    NONE = 0
    # Explicit powers of two; equivalent to the successive auto() values.
    XOBJECT_IMAGES = 1
    INLINE_IMAGES = 2
    DRAWING_IMAGES = 4
    ALL = XOBJECT_IMAGES | INLINE_IMAGES | DRAWING_IMAGES
    IMAGES = ALL  # for consistency with ObjectDeletionFlag
|
||||
66
venv/lib/python3.12/site-packages/pypdf/errors.py
Normal file
66
venv/lib/python3.12/site-packages/pypdf/errors.py
Normal file
@ -0,0 +1,66 @@
|
||||
"""
|
||||
All errors/exceptions pypdf raises and all of the warnings it uses.
|
||||
|
||||
Please note that broken PDF files might cause other Exceptions.
|
||||
"""
|
||||
|
||||
|
||||
class DeprecationError(Exception):
    """Raised when a deprecated feature is used (hard-failure variant of a warning)."""
|
||||
|
||||
|
||||
class DependencyError(Exception):
    """
    Raised when a required dependency (a library or module that pypdf depends on)
    is not available or cannot be imported.
    """
|
||||
|
||||
|
||||
class PyPdfError(Exception):
    """Base class for all exceptions raised by pypdf; catch this to handle any of them."""
|
||||
|
||||
|
||||
class PdfReadError(PyPdfError):
    """Raised when there is an issue reading (parsing/decoding) a PDF file."""
|
||||
|
||||
|
||||
class PageSizeNotDefinedError(PyPdfError):
    """Raised when the page size of a PDF document is not defined."""
|
||||
|
||||
|
||||
class PdfReadWarning(UserWarning):
    """Issued (not raised) when there is a potential issue reading a PDF file, but it can still be read."""
|
||||
|
||||
|
||||
class PdfStreamError(PdfReadError):
    """Raised when there is an issue reading the stream of data in a PDF file."""
|
||||
|
||||
|
||||
class ParseError(PyPdfError):
    """
    Raised when there is an issue parsing (analyzing and understanding the
    structure and meaning of) a PDF file.
    """
|
||||
|
||||
|
||||
class FileNotDecryptedError(PdfReadError):
    """
    Raised when a PDF file that has been encrypted
    (meaning it requires a password to be accessed) has not been successfully
    decrypted.
    """
|
||||
|
||||
|
||||
class WrongPasswordError(FileNotDecryptedError):
    """Raised when the wrong password is used to try to decrypt an encrypted PDF file."""
|
||||
|
||||
|
||||
class EmptyFileError(PdfReadError):
    """Raised when a PDF file is empty or has no content."""
|
||||
|
||||
|
||||
class EmptyImageDataError(PyPdfError):
    """Raised when trying to process an image that has no data."""
|
||||
|
||||
|
||||
# Shared message used when a stream ends before a complete object was read.
STREAM_TRUNCATED_PREMATURELY = "Stream has ended unexpectedly"
|
||||
836
venv/lib/python3.12/site-packages/pypdf/filters.py
Normal file
836
venv/lib/python3.12/site-packages/pypdf/filters.py
Normal file
@ -0,0 +1,836 @@
|
||||
# Copyright (c) 2006, Mathieu Fenniak
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
# met:
|
||||
#
|
||||
# * Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
# * The name of the author may not be used to endorse or promote products
|
||||
# derived from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
"""
|
||||
Implementation of stream filters for PDF.
|
||||
|
||||
See TABLE H.1 Abbreviations for standard filter names
|
||||
"""
|
||||
__author__ = "Mathieu Fenniak"
|
||||
__author_email__ = "biziqe@mathieu.fenniak.net"
|
||||
|
||||
import math
|
||||
import struct
|
||||
import zlib
|
||||
from base64 import a85decode
|
||||
from dataclasses import dataclass
|
||||
from io import BytesIO
|
||||
from typing import Any, Dict, List, Optional, Tuple, Union, cast
|
||||
|
||||
from ._codecs._codecs import LzwCodec as _LzwCodec
|
||||
from ._utils import (
|
||||
WHITESPACES_AS_BYTES,
|
||||
deprecate,
|
||||
deprecate_with_replacement,
|
||||
deprecation_no_replacement,
|
||||
logger_warning,
|
||||
)
|
||||
from .constants import CcittFaxDecodeParameters as CCITT
|
||||
from .constants import FilterTypeAbbreviations as FTA
|
||||
from .constants import FilterTypes as FT
|
||||
from .constants import ImageAttributes as IA
|
||||
from .constants import LzwFilterParameters as LZW
|
||||
from .constants import StreamAttributes as SA
|
||||
from .errors import DeprecationError, PdfReadError, PdfStreamError
|
||||
from .generic import (
|
||||
ArrayObject,
|
||||
DictionaryObject,
|
||||
IndirectObject,
|
||||
NullObject,
|
||||
)
|
||||
|
||||
|
||||
def decompress(data: bytes) -> bytes:
    """
    Decompress the given data using zlib.

    Three strategies are tried in order: a plain ``zlib.decompress``, a
    decompression object (buffered reading for larger inputs), and finally
    a tolerant byte-by-byte pass with an enlarged window that keeps
    whatever could be recovered from a corrupt stream.

    Args:
        data: The input data to be decompressed.

    Returns:
        The decompressed data.

    """
    try:
        return zlib.decompress(data)
    except zlib.error:
        pass
    try:
        # For larger files, use decompression object to enable buffered reading
        return zlib.decompressobj().decompress(data)
    except zlib.error:
        # If still failing, then try with increased window size,
        # swallowing per-byte errors and keeping the recoverable prefix.
        decomp = zlib.decompressobj(zlib.MAX_WBITS | 32)
        recovered = b""
        for index in range(len(data)):
            try:
                recovered += decomp.decompress(data[index : index + 1])
            except zlib.error:
                pass
        return recovered
|
||||
|
||||
|
||||
class FlateDecode:
|
||||
    @staticmethod
    def decode(
        data: bytes,
        decode_parms: Optional[DictionaryObject] = None,
        **kwargs: Any,
    ) -> bytes:
        """
        Decode data which is flate-encoded.

        Args:
            data: flate-encoded data.
            decode_parms: a dictionary of values, understanding the
                "/Predictor":<int> key only

        Returns:
            The flate-decoded data, with any TIFF (predictor 2) or PNG
            (predictors 10-15) prediction reversed.

        Raises:
            PdfReadError: If the predictor value is unsupported or the
                predicted image data is malformed.

        """
        if isinstance(decode_parms, ArrayObject):
            # Array-valued decode_parms used to be accepted; now a hard error.
            raise DeprecationError("decode_parms as ArrayObject is deprecated")

        str_data = decompress(data)
        predictor = 1

        if decode_parms:
            try:
                predictor = decode_parms.get("/Predictor", 1)
            except (AttributeError, TypeError):  # Type Error is NullObject
                pass  # Usually an array with a null object was read
        # predictor 1 == no predictor
        if predictor != 1:
            # /Columns, the number of samples in each row, has a default value of 1;
            # §7.4.4.3, ISO 32000.
            DEFAULT_BITS_PER_COMPONENT = 8
            try:
                columns = cast(int, decode_parms[LZW.COLUMNS].get_object())  # type: ignore
            except (TypeError, KeyError):
                columns = 1
            try:
                colors = cast(int, decode_parms[LZW.COLORS].get_object())  # type: ignore
            except (TypeError, KeyError):
                colors = 1
            try:
                bits_per_component = cast(
                    int,
                    decode_parms[LZW.BITS_PER_COMPONENT].get_object(),  # type: ignore
                )
            except (TypeError, KeyError):
                bits_per_component = DEFAULT_BITS_PER_COMPONENT

            # PNG predictor can vary by row and so is the lead byte on each row
            rowlength = (
                math.ceil(columns * colors * bits_per_component / 8) + 1
            )  # number of bytes

            # TIFF prediction:
            if predictor == 2:
                rowlength -= 1  # remove the predictor byte
                bpp = rowlength // columns
                str_data = bytearray(str_data)
                # Each sample is stored as a delta from the sample bpp bytes
                # to its left; undo that in place.
                for i in range(len(str_data)):
                    if i % rowlength >= bpp:
                        str_data[i] = (str_data[i] + str_data[i - bpp]) % 256
                str_data = bytes(str_data)
            # PNG prediction:
            elif 10 <= predictor <= 15:
                str_data = FlateDecode._decode_png_prediction(
                    str_data, columns, rowlength
                )
            else:
                raise PdfReadError(f"Unsupported flatedecode predictor {predictor!r}")
        return str_data
||||
|
||||
@staticmethod
def _decode_png_prediction(data: bytes, columns: int, rowlength: int) -> bytes:
    """
    Reverse PNG-style predictor filtering on decompressed stream data.

    Args:
        data: the decompressed data; its length must be a whole multiple
            of ``rowlength``.
        columns: number of samples per row (``/Columns``).
        rowlength: number of bytes per row *including* the leading
            per-row filter byte.

    Returns:
        The unfiltered data, with the per-row filter bytes removed.

    Raises:
        PdfReadError: if ``data`` is not a whole number of rows, or a row
            carries an unknown filter byte.

    """
    # PNG prediction can vary from row to row
    if len(data) % rowlength != 0:
        raise PdfReadError("Image data is not rectangular")
    output = []
    # The row "above" the first row is treated as all zeros.
    prev_rowdata = (0,) * rowlength
    bpp = (rowlength - 1) // columns  # recomputed locally to not change params
    for row in range(0, len(data), rowlength):
        rowdata: List[int] = list(data[row : row + rowlength])
        # Byte 0 of every row selects that row's predictor.
        filter_byte = rowdata[0]

        if filter_byte == 0:
            # PNG None Predictor: row is stored raw.
            pass
        elif filter_byte == 1:
            # PNG Sub Predictor: each byte is a delta from the byte one
            # pixel (bpp bytes) to its left; the first pixel is raw.
            for i in range(bpp + 1, rowlength):
                rowdata[i] = (rowdata[i] + rowdata[i - bpp]) % 256
        elif filter_byte == 2:
            # PNG Up Predictor: delta from the same byte in the row above.
            for i in range(1, rowlength):
                rowdata[i] = (rowdata[i] + prev_rowdata[i]) % 256
        elif filter_byte == 3:
            # PNG Average Predictor: delta from floor((left + up) / 2).
            # First pixel has no left neighbour, so only "up" contributes.
            for i in range(1, bpp + 1):
                floor = prev_rowdata[i] // 2
                rowdata[i] = (rowdata[i] + floor) % 256
            for i in range(bpp + 1, rowlength):
                left = rowdata[i - bpp]
                floor = (left + prev_rowdata[i]) // 2
                rowdata[i] = (rowdata[i] + floor) % 256
        elif filter_byte == 4:
            # PNG Paeth Predictor: delta from whichever of left / up /
            # up-left is closest to p = left + up - up_left.
            # First pixel: left and up-left are zero, so Paeth reduces to "up".
            for i in range(1, bpp + 1):
                rowdata[i] = (rowdata[i] + prev_rowdata[i]) % 256
            for i in range(bpp + 1, rowlength):
                left = rowdata[i - bpp]
                up = prev_rowdata[i]
                up_left = prev_rowdata[i - bpp]

                p = left + up - up_left
                dist_left = abs(p - left)
                dist_up = abs(p - up)
                dist_up_left = abs(p - up_left)

                # Tie-breaking order (left, then up, then up-left) is part
                # of the PNG specification and must not be reordered.
                if dist_left <= dist_up and dist_left <= dist_up_left:
                    paeth = left
                elif dist_up <= dist_up_left:
                    paeth = up
                else:
                    paeth = up_left

                rowdata[i] = (rowdata[i] + paeth) % 256
        else:
            raise PdfReadError(
                f"Unsupported PNG filter {filter_byte!r}"
            )  # pragma: no cover
        # Subsequent rows predict against the *decoded* bytes of this row.
        prev_rowdata = tuple(rowdata)
        # Strip the filter byte before emitting the row.
        output.extend(rowdata[1:])
    return bytes(output)
|
||||
|
||||
@staticmethod
|
||||
def encode(data: bytes, level: int = -1) -> bytes:
|
||||
"""
|
||||
Compress the input data using zlib.
|
||||
|
||||
Args:
|
||||
data: The data to be compressed.
|
||||
level: See https://docs.python.org/3/library/zlib.html#zlib.compress
|
||||
|
||||
Returns:
|
||||
The compressed data.
|
||||
|
||||
"""
|
||||
return zlib.compress(data, level)
|
||||
|
||||
|
||||
class ASCIIHexDecode:
    """
    The ASCIIHexDecode filter decodes data that has been encoded as pairs
    of ASCII hexadecimal digits (optionally separated by whitespace and
    terminated by the ``>`` EOD marker) back into binary form.
    """

    @staticmethod
    def decode(
        data: Union[str, bytes],
        decode_parms: Optional[DictionaryObject] = None,
        **kwargs: Any,
    ) -> bytes:
        """
        Decode an ASCII-Hex encoded data stream.

        Args:
            data: a str or bytes sequence of hexadecimal digit pairs to be
                converted into bytes; whitespace is ignored and ``>`` marks
                the end of data (EOD).
            decode_parms: ignored by this filter.

        Returns:
            The decoded bytes.

        Raises:
            ValueError: if a non-hexadecimal, non-whitespace character is
                encountered before the EOD marker.

        """
        # decode_parms is unused here

        if isinstance(data, str):
            data = data.encode()
        retval = b""
        hex_pair = b""
        index = 0
        while True:
            if index >= len(data):
                logger_warning(
                    "missing EOD in ASCIIHexDecode, check if output is OK", __name__
                )
                break  # Reached end of string even if no EOD
            char = data[index : index + 1]
            if char == b">":
                break
            if char.isspace():
                index += 1
                continue
            hex_pair += char
            if len(hex_pair) == 2:
                retval += bytes((int(hex_pair, base=16),))
                hex_pair = b""
            index += 1
        # Per the PDF specification, if EOD is reached after an odd number of
        # digits the filter shall behave as if a zero followed the last digit.
        # The previous implementation used ``assert``, which crashed on such
        # (valid) streams and is silently stripped under ``python -O``.
        if hex_pair:
            retval += bytes((int(hex_pair + b"0", base=16),))
        return retval
|
||||
|
||||
|
||||
class RunLengthDecode:
    """
    Decoder for the byte-oriented run-length format.

    The encoded stream is a sequence of runs, each introduced by one length
    byte L:

    * ``0 <= L <= 127``: the next ``L + 1`` bytes are copied literally;
    * ``129 <= L <= 255``: the next single byte is repeated ``257 - L``
      (2 to 128) times;
    * ``L == 128``: end of data (EOD).
    """

    @staticmethod
    def decode(
        data: bytes,
        decode_parms: Optional[DictionaryObject] = None,
        **kwargs: Any,
    ) -> bytes:
        """
        Decode a run length encoded data stream.

        Args:
            data: a bytes sequence of length/data runs.
            decode_parms: ignored.

        Returns:
            The decompressed bytes.

        Raises:
            PdfStreamError: if the EOD marker appears before the end of
                ``data``.

        """
        # decode_parms is unused here

        chunks = []
        pos = 0
        end = len(data)
        while True:
            if pos >= end:
                logger_warning(
                    "missing EOD in RunLengthDecode, check if output is OK", __name__
                )
                break  # reach End Of String even if no EOD
            run_length = data[pos]
            pos += 1
            if run_length == 128:
                # EOD marker must be the very last byte of the stream.
                if pos < end:
                    raise PdfStreamError("Early EOD in RunLengthDecode")
                break
            if run_length < 128:
                # Literal run: copy the next run_length + 1 bytes verbatim.
                count = run_length + 1
                chunks.append(data[pos : pos + count])
                pos += count
            else:  # >128
                # Repeat run: duplicate the next byte 257 - run_length times.
                count = 257 - run_length
                chunks.append(bytes((data[pos],)) * count)
                pos += 1
        return b"".join(chunks)
|
||||
|
||||
|
||||
class LZWDecode:
    """LZW stream decoder; the actual algorithm lives in ``_LzwCodec``."""

    class Decoder:
        # Code-word constants kept for backward compatibility with callers
        # that reference them on this class.
        STOP = 257
        CLEARDICT = 256

        def __init__(self, data: bytes) -> None:
            self.data = data

        def decode(self) -> bytes:
            """Run the LZW codec over the stored data."""
            return _LzwCodec().decode(self.data)

    @staticmethod
    def _decodeb(
        data: bytes,
        decode_parms: Optional[DictionaryObject] = None,
        **kwargs: Any,
    ) -> bytes:
        """
        Decode an LZW encoded data stream.

        Args:
            data: ``bytes`` text to decode.
            decode_parms: a dictionary of parameter values (ignored).

        Returns:
            The decoded bytes.

        """
        # decode_parms is unused here
        return _LzwCodec().decode(data)

    @staticmethod
    def decode(
        data: bytes,
        decode_parms: Optional[DictionaryObject] = None,
        **kwargs: Any,
    ) -> str:  # deprecated
        """
        Decode an LZW encoded data stream (deprecated str-returning variant).

        Args:
            data: ``bytes`` text to decode.
            decode_parms: a dictionary of parameter values (ignored).

        Returns:
            The decoded data as a latin-1 string; prefer ``_decodeb`` which
            returns the raw bytes.

        """
        # decode_parms is unused here
        deprecate("LZWDecode.decode will return bytes instead of str in pypdf 6.0.0")
        decoded = LZWDecode._decodeb(data)
        return decoded.decode("latin-1")
|
||||
|
||||
|
||||
class ASCII85Decode:
    """Decodes string ASCII85-encoded data into a byte format."""

    @staticmethod
    def decode(
        data: Union[str, bytes],
        decode_parms: Optional[DictionaryObject] = None,
        **kwargs: Any,
    ) -> bytes:
        """
        Decode an Ascii85 encoded data stream.

        Args:
            data: ``bytes`` or ``str`` text to decode.
            decode_parms: a dictionary of parameter values (ignored).

        Returns:
            The decoded bytes.

        """
        if isinstance(data, str):
            data = data.encode()
        stripped = data.strip(WHITESPACES_AS_BYTES)
        try:
            return a85decode(stripped, adobe=True, ignorechars=WHITESPACES_AS_BYTES)
        except ValueError as error:
            # Some producers omit the Adobe "~>" terminator; retry leniently
            # in plain (non-Adobe) mode for exactly that failure.
            if error.args[0] != "Ascii85 encoded byte sequences must end with b'~>'":
                raise
            logger_warning("Ignoring missing Ascii85 end marker.", __name__)
            return a85decode(stripped, adobe=False, ignorechars=WHITESPACES_AS_BYTES)
|
||||
|
||||
|
||||
class DCTDecode:
    """
    DCTDecode (JPEG) filter: the stream payload is returned unchanged, as
    it is consumed downstream by image libraries that decode JPEG directly.
    """

    @staticmethod
    def decode(
        data: bytes,
        decode_parms: Optional[DictionaryObject] = None,
        **kwargs: Any,
    ) -> bytes:
        # decode_parms is unused here
        # Deliberate pass-through: no transformation is applied.
        return data
|
||||
|
||||
|
||||
class JPXDecode:
    """
    JPXDecode (JPEG 2000) filter: the stream payload is returned unchanged,
    as it is consumed downstream by image libraries that decode JPEG 2000.
    """

    @staticmethod
    def decode(
        data: bytes,
        decode_parms: Optional[DictionaryObject] = None,
        **kwargs: Any,
    ) -> bytes:
        # decode_parms is unused here
        # Deliberate pass-through: no transformation is applied.
        return data
|
||||
|
||||
|
||||
@dataclass
class CCITTParameters:
    """§7.4.6, optional parameters for the CCITTFaxDecode filter."""

    K: int = 0
    columns: int = 0
    rows: int = 0
    EndOfBlock: Union[int, None] = None
    EndOfLine: Union[int, None] = None
    EncodedByteAlign: Union[int, None] = None
    DamagedRowsBeforeError: Union[int, None] = None

    @property
    def group(self) -> int:
        """Return the CCITT group (3 or 4) implied by the K parameter."""
        # K < 0  -> pure two-dimensional encoding (Group 4).
        # K == 0 -> pure one-dimensional encoding (Group 3, 1-D).
        # K > 0  -> mixed one- and two-dimensional encoding (Group 3, 2-D).
        return 4 if self.K < 0 else 3
|
||||
|
||||
|
||||
def __create_old_class_instance(
    K: int = 0,
    columns: int = 0,
    rows: int = 0
) -> CCITTParameters:
    """Deprecated factory: warn about the old name, then build the dataclass."""
    deprecate_with_replacement("CCITParameters", "CCITTParameters", "6.0.0")
    return CCITTParameters(K, columns, rows)


# Alias for the old, misspelled class name: calling ``CCITParameters(...)``
# emits a deprecation warning and returns a ``CCITTParameters`` instance.
CCITParameters = __create_old_class_instance
|
||||
|
||||
|
||||
class CCITTFaxDecode:
    """
    §7.4.6, CCITTFaxDecode filter (ISO 32000).

    Either Group 3 or Group 4 CCITT facsimile (fax) encoding.
    CCITT encoding is bit-oriented, not byte-oriented.

    §7.4.6, optional parameters for the CCITTFaxDecode filter.
    """

    @staticmethod
    def _get_parameters(
        parameters: Union[None, ArrayObject, DictionaryObject, IndirectObject],
        rows: Union[int, IndirectObject],
    ) -> CCITTParameters:
        """Extract K and Columns from decode parms, with spec defaults."""
        # §7.4.6, optional parameters for the CCITTFaxDecode filter
        # Defaults per the spec: K = 0 (Group 3 1-D), Columns = 1728.
        k = 0
        columns = 1728
        if parameters:
            parameters_unwrapped = cast(
                Union[ArrayObject, DictionaryObject], parameters.get_object()
            )
            if isinstance(parameters_unwrapped, ArrayObject):
                # An array of parameter dictionaries: later entries win.
                for decode_parm in parameters_unwrapped:
                    if CCITT.COLUMNS in decode_parm:
                        columns = decode_parm[CCITT.COLUMNS].get_object()
                    if CCITT.K in decode_parm:
                        k = decode_parm[CCITT.K].get_object()
            else:
                if CCITT.COLUMNS in parameters_unwrapped:
                    columns = parameters_unwrapped[CCITT.COLUMNS].get_object()  # type: ignore
                if CCITT.K in parameters_unwrapped:
                    k = parameters_unwrapped[CCITT.K].get_object()  # type: ignore

        return CCITTParameters(K=k, columns=columns, rows=int(rows))

    @staticmethod
    def decode(
        data: bytes,
        decode_parms: Optional[DictionaryObject] = None,
        height: int = 0,
        **kwargs: Any,
    ) -> bytes:
        """
        Wrap the raw CCITT payload in a minimal little-endian TIFF header so
        that downstream image libraries can decode it; the CCITT bitstream
        itself is not decoded here.
        """
        # decode_parms is unused here
        if isinstance(decode_parms, ArrayObject):  # deprecated
            deprecation_no_replacement(
                "decode_parms being an ArrayObject", removed_in="3.15.5"
            )
        params = CCITTFaxDecode._get_parameters(decode_parms, height)

        img_size = len(data)
        # One IFD with 8 tags; each tag is (id, type, count, value) = "hhll".
        tiff_header_struct = "<2shlh" + "hhll" * 8 + "h"
        tiff_header = struct.pack(
            tiff_header_struct,
            b"II",  # Byte order indication: Little endian
            42,  # Version number (always 42)
            8,  # Offset to first IFD
            8,  # Number of tags in IFD
            256,
            4,
            1,
            params.columns,  # ImageWidth, LONG, 1, width
            257,
            4,
            1,
            params.rows,  # ImageLength, LONG, 1, length
            258,
            3,
            1,
            1,  # BitsPerSample, SHORT, 1, 1
            259,
            3,
            1,
            params.group,  # Compression, SHORT, 1, 4 = CCITT Group 4 fax encoding
            262,
            3,
            1,
            0,  # Thresholding, SHORT, 1, 0 = WhiteIsZero
            273,
            4,
            1,
            struct.calcsize(
                tiff_header_struct
            ),  # StripOffsets, LONG, 1, length of header
            278,
            4,
            1,
            params.rows,  # RowsPerStrip, LONG, 1, length
            279,
            4,
            1,
            img_size,  # StripByteCounts, LONG, 1, size of image
            0,  # last IFD
        )

        return tiff_header + data
|
||||
|
||||
|
||||
def decode_stream_data(stream: Any) -> bytes:
    """
    Decode the stream data based on the specified filters.

    This function decodes the stream data using the filters provided in the
    stream, applying each filter in sequence to the output of the previous.

    Args:
        stream: The input stream object containing the data and filters.

    Returns:
        The decoded stream data.

    Raises:
        NotImplementedError: If an unsupported filter type is encountered.

    """
    filters = stream.get(SA.FILTER, ())
    if isinstance(filters, IndirectObject):
        filters = cast(ArrayObject, filters.get_object())
    if not isinstance(filters, ArrayObject):
        # We have a single filter instance
        filters = (filters,)
    # Default to one empty parameter dict per filter when /DecodeParms is absent.
    decode_parms = stream.get(SA.DECODE_PARMS, ({},) * len(filters))
    if not isinstance(decode_parms, (list, tuple)):
        decode_parms = (decode_parms,)
    data: bytes = stream._data
    # If there is not data to decode we should not try to decode the data.
    if not data:
        return data
    # NOTE(review): zip() stops at the shorter of filters/decode_parms, so a
    # short /DecodeParms array would silently skip trailing filters — confirm
    # both are always the same length upstream.
    for filter_name, params in zip(filters, decode_parms):
        if isinstance(params, NullObject):
            params = {}
        if filter_name in (FT.ASCII_HEX_DECODE, FTA.AHx):
            data = ASCIIHexDecode.decode(data)
        elif filter_name in (FT.ASCII_85_DECODE, FTA.A85):
            data = ASCII85Decode.decode(data)
        elif filter_name in (FT.LZW_DECODE, FTA.LZW):
            data = LZWDecode._decodeb(data, params)
        elif filter_name in (FT.FLATE_DECODE, FTA.FL):
            data = FlateDecode.decode(data, params)
        elif filter_name in (FT.RUN_LENGTH_DECODE, FTA.RL):
            data = RunLengthDecode.decode(data)
        elif filter_name == FT.CCITT_FAX_DECODE:
            height = stream.get(IA.HEIGHT, ())
            data = CCITTFaxDecode.decode(data, params, height)
        elif filter_name == FT.DCT_DECODE:
            data = DCTDecode.decode(data)
        elif filter_name == FT.JPX_DECODE:
            data = JPXDecode.decode(data)
        elif filter_name == "/Crypt":
            # A /Crypt filter without /Name or /Type is the Identity
            # transform, i.e. a no-op; anything else is unsupported.
            if "/Name" in params or "/Type" in params:
                raise NotImplementedError(
                    "/Crypt filter with /Name or /Type not supported yet"
                )
        else:
            raise NotImplementedError(f"Unsupported filter {filter_name}")
    return data
|
||||
|
||||
|
||||
def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes, Any]:
    """
    Users need to have the pillow package installed.

    It's unclear if pypdf will keep this function here, hence it's private.
    It might get removed at any point.

    Args:
        x_object_obj: an image XObject dictionary (width, height, color
            space, filters and raw stream data are read from it).

    Returns:
        Tuple[file extension, bytes, PIL.Image.Image]

    """
    # Imported lazily so pillow stays an optional dependency.
    from ._xobj_image_helpers import (
        Image,
        UnidentifiedImageError,
        _apply_decode,
        _extended_image_frombytes,
        _get_mode_and_invert_color,
        _handle_flate,
        _handle_jpx,
    )

    def _apply_alpha(
        img: Image.Image,
        x_object_obj: Dict[str, Any],
        obj_as_text: str,
        image_format: str,
        extension: str,
    ) -> Tuple[Image.Image, str, str]:
        # Merge an /SMask soft mask into the image as an alpha channel and
        # switch to a format that supports transparency.
        alpha = None
        if IA.S_MASK in x_object_obj:  # add alpha channel
            # Recursively decode the mask XObject; its PIL image is slot [2].
            alpha = _xobj_to_image(x_object_obj[IA.S_MASK])[2]
            if img.size != alpha.size:
                logger_warning(
                    f"image and mask size not matching: {obj_as_text}", __name__
                )
            else:
                # TODO : implement mask
                if alpha.mode != "L":
                    alpha = alpha.convert("L")
                if img.mode == "P":
                    img = img.convert("RGB")
                elif img.mode == "1":
                    img = img.convert("L")
                img.putalpha(alpha)
            if "JPEG" in image_format:
                extension = ".jp2"
                image_format = "JPEG2000"
            else:
                extension = ".png"
                image_format = "PNG"
        return img, extension, image_format

    # for error reporting
    # NOTE(review): the condition looks inverted — if x_object_obj were None,
    # ``None.indirect_reference`` would raise; presumably this should test
    # ``indirect_reference`` instead. Branch is marked no-cover upstream.
    obj_as_text = (
        x_object_obj.indirect_reference.__repr__()
        if x_object_obj is None  # pragma: no cover
        else x_object_obj.__repr__()
    )

    # Get size and data
    size = (cast(int, x_object_obj[IA.WIDTH]), cast(int, x_object_obj[IA.HEIGHT]))
    data = x_object_obj.get_data()  # type: ignore
    if isinstance(data, str):  # pragma: no cover
        data = data.encode()
    # Drop a single trailing newline that some producers append to the data.
    if len(data) % (size[0] * size[1]) == 1 and data[-1] == 0x0A:  # ie. '\n'
        data = data[:-1]

    # Get color properties
    colors = x_object_obj.get("/Colors", 1)
    color_space: Any = x_object_obj.get("/ColorSpace", NullObject()).get_object()
    if isinstance(color_space, list) and len(color_space) == 1:
        # Unwrap a one-element color-space array to its single entry.
        color_space = color_space[0].get_object()

    mode, invert_color = _get_mode_and_invert_color(x_object_obj, colors, color_space)

    # Get filters; with a filter chain, only the last filter determines the
    # on-disk image encoding.
    filters = x_object_obj.get(SA.FILTER, NullObject()).get_object()
    lfilters = filters[-1] if isinstance(filters, list) else filters

    extension = None
    if lfilters in (FT.FLATE_DECODE, FT.RUN_LENGTH_DECODE):
        img, image_format, extension, _ = _handle_flate(
            size,
            data,
            mode,
            color_space,
            colors,
            obj_as_text,
        )
    elif lfilters in (FT.LZW_DECODE, FT.ASCII_85_DECODE, FT.CCITT_FAX_DECODE):
        # I'm not sure if the following logic is correct.
        # There might not be any relationship between the filters and the
        # extension
        if lfilters in (FT.LZW_DECODE, FT.CCITT_FAX_DECODE):
            extension = ".tiff"  # mime_type = "image/tiff"
            image_format = "TIFF"
        else:
            extension = ".png"  # mime_type = "image/png"
            image_format = "PNG"
        try:
            img = Image.open(BytesIO(data), formats=("TIFF", "PNG"))
        except UnidentifiedImageError:
            # Fall back to interpreting the data as a raw pixel buffer.
            img = _extended_image_frombytes(mode, size, data)
    elif lfilters == FT.DCT_DECODE:
        img, image_format, extension = Image.open(BytesIO(data)), "JPEG", ".jpg"
        # invert_color kept unchanged
    elif lfilters == FT.JPX_DECODE:
        img, image_format, extension, invert_color = _handle_jpx(
            size, data, mode, color_space, colors
        )
    elif lfilters == FT.CCITT_FAX_DECODE:
        # Data was wrapped in a TIFF header by CCITTFaxDecode.decode.
        img, image_format, extension, invert_color = (
            Image.open(BytesIO(data), formats=("TIFF",)),
            "TIFF",
            ".tiff",
            False,
        )
    elif mode == "CMYK":
        img, image_format, extension, invert_color = (
            _extended_image_frombytes(mode, size, data),
            "TIFF",
            ".tif",
            False,
        )
    elif mode == "":
        raise PdfReadError(f"ColorSpace field not found in {x_object_obj}")
    else:
        img, image_format, extension, invert_color = (
            _extended_image_frombytes(mode, size, data),
            "PNG",
            ".png",
            False,
        )

    img = _apply_decode(img, x_object_obj, lfilters, color_space, invert_color)
    img, extension, image_format = _apply_alpha(
        img, x_object_obj, obj_as_text, image_format, extension
    )

    # Save image to bytes
    img_byte_arr = BytesIO()
    try:
        img.save(img_byte_arr, format=image_format)
    except OSError:  # pragma: no cover # covered with pillow 10.3
        # in case of we convert to RGBA and then to PNG
        img1 = img.convert("RGBA")
        image_format = "PNG"
        extension = ".png"
        img_byte_arr = BytesIO()
        img1.save(img_byte_arr, format=image_format)
    data = img_byte_arr.getvalue()

    try:  # temporary try/except until other fixes of images
        # Re-open so the returned PIL image reflects the serialized bytes.
        img = Image.open(BytesIO(data))
    except Exception:
        img = None  # type: ignore
    return extension, data, img
|
||||
238
venv/lib/python3.12/site-packages/pypdf/generic/__init__.py
Normal file
238
venv/lib/python3.12/site-packages/pypdf/generic/__init__.py
Normal file
@ -0,0 +1,238 @@
|
||||
# Copyright (c) 2006, Mathieu Fenniak
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
# met:
|
||||
#
|
||||
# * Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
# * The name of the author may not be used to endorse or promote products
|
||||
# derived from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
"""Implementation of generic PDF objects (dictionary, number, string, ...)."""
|
||||
__author__ = "Mathieu Fenniak"
|
||||
__author_email__ = "biziqe@mathieu.fenniak.net"
|
||||
|
||||
from typing import List, Optional, Tuple, Union
|
||||
|
||||
from .._utils import (
|
||||
deprecation_with_replacement,
|
||||
)
|
||||
from ..constants import OutlineFontFlag
|
||||
from ._base import (
|
||||
BooleanObject,
|
||||
ByteStringObject,
|
||||
FloatObject,
|
||||
IndirectObject,
|
||||
NameObject,
|
||||
NullObject,
|
||||
NumberObject,
|
||||
PdfObject,
|
||||
TextStringObject,
|
||||
encode_pdfdocencoding,
|
||||
is_null_or_none,
|
||||
)
|
||||
from ._data_structures import (
|
||||
ArrayObject,
|
||||
ContentStream,
|
||||
DecodedStreamObject,
|
||||
Destination,
|
||||
DictionaryObject,
|
||||
EncodedStreamObject,
|
||||
Field,
|
||||
StreamObject,
|
||||
TreeObject,
|
||||
read_object,
|
||||
)
|
||||
from ._files import EmbeddedFile
|
||||
from ._fit import Fit
|
||||
from ._outline import OutlineItem
|
||||
from ._rectangle import RectangleObject
|
||||
from ._utils import (
|
||||
create_string_object,
|
||||
decode_pdfdocencoding,
|
||||
hex_to_rgb,
|
||||
read_hex_string_from_stream,
|
||||
read_string_from_stream,
|
||||
)
|
||||
from ._viewerpref import ViewerPreferences
|
||||
|
||||
# Module-level default destination fit, used when callers do not supply one.
PAGE_FIT = Fit.fit()
|
||||
|
||||
|
||||
class AnnotationBuilder:  # deprecated
    """
    The AnnotationBuilder is deprecated.

    Instead, use the annotation classes in pypdf.annotations.

    See `adding PDF annotations <../user/adding-pdf-annotations.html>`_ for
    its usage combined with PdfWriter.

    Every method below is a stub that only reports the deprecation and its
    replacement; none of them builds an annotation anymore.
    """

    # NOTE(review): import deliberately placed inside the class body in the
    # original shim; kept here unchanged for backward compatibility.
    from ..generic._rectangle import RectangleObject

    @staticmethod
    def text(
        rect: Union[RectangleObject, Tuple[float, float, float, float]],
        text: str,
        open: bool = False,
        flags: int = 0,
    ) -> None:
        """Deprecated: use :class:`pypdf.annotations.Text`."""
        deprecation_with_replacement(
            "AnnotationBuilder.text", "pypdf.annotations.Text", "5.0.0"
        )

    @staticmethod
    def free_text(
        text: str,
        rect: Union[RectangleObject, Tuple[float, float, float, float]],
        font: str = "Helvetica",
        bold: bool = False,
        italic: bool = False,
        font_size: str = "14pt",
        font_color: str = "000000",
        border_color: Optional[str] = "000000",
        background_color: Optional[str] = "ffffff",
    ) -> None:
        """Deprecated: use :class:`pypdf.annotations.FreeText`."""
        deprecation_with_replacement(
            "AnnotationBuilder.free_text", "pypdf.annotations.FreeText", "5.0.0"
        )

    @staticmethod
    def popup(
        *,
        rect: Union[RectangleObject, Tuple[float, float, float, float]],
        flags: int = 0,
        parent: Optional[DictionaryObject] = None,
        open: bool = False,
    ) -> None:
        """Deprecated: use :class:`pypdf.annotations.Popup`."""
        deprecation_with_replacement(
            "AnnotationBuilder.popup", "pypdf.annotations.Popup", "5.0.0"
        )

    @staticmethod
    def line(
        p1: Tuple[float, float],
        p2: Tuple[float, float],
        rect: Union[RectangleObject, Tuple[float, float, float, float]],
        text: str = "",
        title_bar: Optional[str] = None,
    ) -> None:
        """Deprecated: use :class:`pypdf.annotations.Line`."""
        deprecation_with_replacement(
            "AnnotationBuilder.line", "pypdf.annotations.Line", "5.0.0"
        )

    @staticmethod
    def polyline(
        vertices: List[Tuple[float, float]],
    ) -> None:
        """Deprecated: use :class:`pypdf.annotations.PolyLine`."""
        deprecation_with_replacement(
            "AnnotationBuilder.polyline", "pypdf.annotations.PolyLine", "5.0.0"
        )

    @staticmethod
    def rectangle(
        rect: Union[RectangleObject, Tuple[float, float, float, float]],
        interiour_color: Optional[str] = None,
    ) -> None:
        """Deprecated: use :class:`pypdf.annotations.Rectangle`."""
        # NOTE: the "interiour_color" spelling is kept — it is part of the
        # historical public signature.
        deprecation_with_replacement(
            "AnnotationBuilder.rectangle", "pypdf.annotations.Rectangle", "5.0.0"
        )

    @staticmethod
    def highlight(
        *,
        rect: Union[RectangleObject, Tuple[float, float, float, float]],
        quad_points: ArrayObject,
        highlight_color: str = "ff0000",
        printing: bool = False,
    ) -> None:
        """Deprecated: use :class:`pypdf.annotations.Highlight`."""
        deprecation_with_replacement(
            "AnnotationBuilder.highlight", "pypdf.annotations.Highlight", "5.0.0"
        )

    @staticmethod
    def ellipse(
        rect: Union[RectangleObject, Tuple[float, float, float, float]],
        interiour_color: Optional[str] = None,
    ) -> None:
        """Deprecated: use :class:`pypdf.annotations.Ellipse`."""
        deprecation_with_replacement(
            "AnnotationBuilder.ellipse", "pypdf.annotations.Ellipse", "5.0.0"
        )

    @staticmethod
    def polygon(vertices: List[Tuple[float, float]]) -> None:
        """Deprecated: use :class:`pypdf.annotations.Polygon`."""
        deprecation_with_replacement(
            "AnnotationBuilder.polygon", "pypdf.annotations.Polygon", "5.0.0"
        )

    # NOTE(review): class-body import kept as-is; DEFAULT_FIT is needed as a
    # default value in the ``link`` signature below.
    from ._fit import DEFAULT_FIT

    @staticmethod
    def link(
        rect: Union[RectangleObject, Tuple[float, float, float, float]],
        border: Optional[ArrayObject] = None,
        url: Optional[str] = None,
        target_page_index: Optional[int] = None,
        fit: Fit = DEFAULT_FIT,
    ) -> None:
        """Deprecated: use :class:`pypdf.annotations.Link`."""
        deprecation_with_replacement(
            "AnnotationBuilder.link", "pypdf.annotations.Link", "5.0.0"
        )
|
||||
|
||||
|
||||
# Explicit public API of ``pypdf.generic``.
__all__ = [
    "PAGE_FIT",
    "AnnotationBuilder",
    "ArrayObject",
    "BooleanObject",
    "ByteStringObject",
    "ContentStream",
    "DecodedStreamObject",
    "Destination",
    "DictionaryObject",
    "EmbeddedFile",
    "EncodedStreamObject",
    "Field",
    "Fit",
    "FloatObject",
    "IndirectObject",
    "NameObject",
    "NullObject",
    "NumberObject",
    "OutlineFontFlag",
    "OutlineItem",
    "PdfObject",
    "RectangleObject",
    "StreamObject",
    "TextStringObject",
    "TreeObject",
    "ViewerPreferences",
    # Utility functions
    "create_string_object",
    "decode_pdfdocencoding",
    "encode_pdfdocencoding",
    "hex_to_rgb",
    "is_null_or_none",
    "read_hex_string_from_stream",
    # Data structures core functions
    "read_object",
    "read_string_from_stream",
]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user