I am using the open-source unstructured library. When I tried it on a PDF that has background images, design patterns, and XObjects in it, the library also treats those as images and stores their paths. How can we clean the PDFs and store only the images that actually appear in the PDF as figures?
import os
from io import StringIO
from lxml import etree
import pandas as pd
from unstructured.partition.pdf import partition_pdf
import json
class PDFProcessor:
    """Partition a PDF with ``unstructured`` and group its elements by kind.

    The elements returned by ``partition_pdf`` are converted to dictionaries
    and sorted into three buckets: text-like elements, images, and tables.
    """

    # Element ``type`` values that are reported under the "text" key.
    _TEXT_TYPES = frozenset({
        "NarrativeText",
        "Title",
        "ListItem",
        "Text",
        "FigureCaption",
        "UncategorizedText",
    })

    def __init__(self):
        """Create a processor; no state is kept between calls."""

    def process_pdf(self, path: str, max_characters=30000):
        """Partition the PDF at *path* and bucket its elements by type.

        Args:
            path: Filesystem path of the PDF, passed to ``partition_pdf`` as
                ``filename``.
            max_characters: Forwarded to ``partition_pdf`` as
                ``max_characters``.

        Returns:
            A dict with the keys ``"text"``, ``"images"`` and ``"tables"``,
            each holding a list of element dictionaries (``el.to_dict()``).
            Elements whose type is neither text-like, ``"Image"`` nor
            ``"Table"`` are dropped. If anything fails, the three lists are
            empty and an extra ``"error"`` key carries the exception message;
            the exception itself is not propagated.
        """
        try:
            pdf_elements = partition_pdf(
                filename=path,
                extract_images_in_pdf=True,
                strategy='hi_res',
                infer_table_structure=True,
                extract_image_block_types=["Image"],
                # Was misspelled ``max_character``, so the caller's value was
                # never passed under the option name the library documents.
                # NOTE(review): per the unstructured chunking docs this option
                # is only consulted when a chunking strategy is also selected
                # - confirm whether ``chunking_strategy`` should be set too.
                max_characters=max_characters,
            )
            chunks = [el.to_dict() for el in pdf_elements]

            # Single pass over the elements instead of one scan per bucket.
            text_data, image_data, table_data = [], [], []
            for chunk in chunks:
                kind = chunk["type"]
                if kind in self._TEXT_TYPES:
                    text_data.append(chunk)
                elif kind == "Image":
                    image_data.append(chunk)
                elif kind == "Table":
                    table_data.append(chunk)

            return {
                "text": text_data,
                "images": image_data,
                "tables": table_data
            }
        except Exception as e:
            # Deliberate best-effort boundary: callers receive an error
            # payload with the same shape as a successful result.
            print(f"[ERROR] Cannot process the document: {e}")
            return {
                "text": [],
                "images": [],
                "tables": [],
                "error": str(e)
            }