| import os |
| from io import BytesIO |
| from tqdm import tqdm |
| import numpy as np |
| from typing import Callable, Dict, List |
| from PIL import Image as PIL_Image |
| from PIL.Image import Image |
|
|
| from datasets import logging |
|
|
# Module-level logger obtained from the `datasets` logging utilities imported above.
logger = logging.get_logger(__name__)
| import PyPDF2 |
|
|
# Cap on how many images are collected per document by the extractors in this module.
MAX_PAGES = 50
# Documents whose ``len(example["file"])`` exceeds this value are skipped
# (presumably a size in bytes, since "file" is wrapped in BytesIO -- confirm).
MAX_PDF_SIZE = 100000000
# Minimum dimensions compared against ``im.width`` / ``im.height`` when small images are filtered out.
MIN_WIDTH, MIN_HEIGHT = 150, 150
| import pdf2image |
|
|
|
|
def pdf2image_image_extraction(pdf_stream):
    """Render a PDF to images with ``pdf2image.convert_from_bytes``.

    Args:
        pdf_stream: Value forwarded unchanged to ``pdf2image.convert_from_bytes``
            (the callers in this module pass ``example["file"]``).

    Returns:
        Whatever ``convert_from_bytes`` returns on success. If the conversion
        raises, the exception is logged as a warning and ``None`` is returned,
        so callers should test the result for truthiness.
    """
    try:
        rendered: List[Image] = pdf2image.convert_from_bytes(pdf_stream)
    except Exception as e:
        # Best-effort: conversion problems are reported, never propagated.
        logger.warning(f"{e}")
        return None
    return rendered
|
|
|
|
def pdf_to_pixelvalues_extractor(example, feature_extractor, inference_method):
    """Turn the images of a PDF into pixel values, preferring embedded images.

    The PDF in ``example["file"]`` is parsed with PyPDF2 and the images
    embedded in its pages are collected. If that yields nothing (or raises),
    the document is rendered with ``pdf2image_image_extraction`` instead and
    the rendered images are used.

    Args:
        example: Mapping holding the PDF content under ``"file"`` and,
            optionally, an ``"id"`` used in log messages. It is updated in
            place with the keys ``"pages"`` and ``"pixel_values"``.
        feature_extractor: Callable invoked as
            ``feature_extractor([rgb_image])["pixel_values"][0]``.
        inference_method: Object exposing a string attribute ``scope`` and a
            method ``get_page_scope(pages_or_images)``.

    Returns:
        The same ``example``. When the file is too large, cannot be read, or
        no usable image is found, ``"pages"`` is 0 and ``"pixel_values"`` is
        ``None``; otherwise ``"pixel_values"`` is a NumPy array.
    """
    example["pages"] = 0
    example["pixel_values"] = None
    doc_id = example.get("id")

    file_size = len(example["file"])
    if file_size > MAX_PDF_SIZE:
        logger.warning(f"too large file {file_size}")
        return example

    try:
        reader = PyPDF2.PdfReader(BytesIO(example["file"]))
    except Exception as e:
        logger.warning(f"read_pdf {e}")
        return example
    example["pages"] = len(reader.pages)

    scope = inference_method.scope
    is_grid = scope == "sample-grid"
    if "sample" in scope and not is_grid:
        # Single-sample scopes look at one page chosen by the inference method.
        page_iterator = [inference_method.get_page_scope(reader.pages)]
    else:
        page_iterator = reader.pages

    def prepare(im):
        # For "sample-grid" the raw images are kept; the grid built from them
        # is run through the feature extractor once at the end.
        if is_grid:
            return im
        return feature_extractor([im.convert("RGB")])["pixel_values"][0]

    def is_too_small(im):
        # NOTE(review): an image is dropped only when BOTH dimensions are below
        # the minimum, whereas nativepdf_to_pixelvalues_extractor drops it when
        # EITHER is. Kept as-is; confirm which rule is intended.
        return im.width < MIN_WIDTH and im.height < MIN_HEIGHT

    # First attempt: images embedded in the PDF pages.
    pixel_values = []
    try:
        for page in page_iterator:
            # Stop before touching the next page once the cap is reached.
            if len(pixel_values) >= MAX_PAGES:
                break
            for image in page.images:
                if len(pixel_values) >= MAX_PAGES:
                    break
                im = PIL_Image.open(BytesIO(image.data))
                if is_too_small(im):
                    continue
                pixel_values.append(prepare(im))
    except Exception as e:
        logger.warning(f"{doc_id} PyPDF get_images {e}")
        # Discard partial results so the fallback below starts from scratch.
        pixel_values = []

    # Fallback: render the document with pdf2image.
    if not pixel_values:
        try:
            images = pdf2image_image_extraction(example["file"])
        except Exception as e:
            logger.warning(f"{doc_id} pdf2image get_images {e}")
            images = []

        if not images:
            logger.warning(f"{doc_id} pdf2image has no images")
            example["pages"] = 0
            return example

        example["pages"] = len(images)
        for im in images:
            if len(pixel_values) >= MAX_PAGES:
                break
            if is_too_small(im):
                continue
            pixel_values.append(prepare(im))

    # Every rendered image may have been filtered out as too small; without
    # this guard the indexing / grid construction below would fail.
    if not pixel_values:
        logger.warning(f"{doc_id} pdf2image has no valid images")
        example["pages"] = 0
        return example

    if is_grid:
        grid = inference_method.get_page_scope(pixel_values)
        pixel_values = feature_extractor([grid.convert("RGB")])["pixel_values"][0]
    elif "sample" in scope:
        pixel_values = pixel_values[0]
    example["pixel_values"] = np.array(pixel_values)
    return example
|
|
|
|
def nativepdf_to_pixelvalues_extractor(example, feature_extractor, inference_method):
    """Turn a PDF into pixel values using images rendered by pdf2image only.

    Args:
        example: Mapping holding the PDF content under ``"file"`` and,
            optionally, an ``"id"`` used for logging and for the skip list.
            It is updated in place with ``"pages"`` and ``"pixel_values"``.
        feature_extractor: Callable invoked as
            ``feature_extractor([rgb_image])["pixel_values"][0]``.
        inference_method: Object exposing a string attribute ``scope`` and a
            method ``get_page_scope(images)``.

    Returns:
        The same ``example``. When the file is too large, is on the skip list
        for the "sample-grid" scope, or yields no usable image, ``"pages"`` is
        0 and ``"pixel_values"`` is ``None``; otherwise ``"pixel_values"`` is a
        NumPy array and ``"pages"`` is the number of images that passed the
        size filter.
    """
    # Document ids that are skipped when the scope is "sample-grid"
    # (presumably because grid construction cannot handle them -- confirm).
    IMPOSSIBLE = ("6483941-Letter-to-John-Campbell.pdf", "7276809-Ocoee-Newspaper-Pages.pdf")

    example["pages"] = 0
    example["pixel_values"] = None
    doc_id = example.get("id")

    file_size = len(example["file"])
    if file_size > MAX_PDF_SIZE:
        logger.warning(f"too large file {file_size}")
        return example

    # Decide on the skip list before rendering: the outcome is the same as
    # checking afterwards, but the expensive conversion is avoided.
    if doc_id in IMPOSSIBLE and inference_method.scope == "sample-grid":
        logger.warning(f"{doc_id} skipped: listed as impossible for sample-grid")
        return example

    try:
        images = pdf2image_image_extraction(example["file"])
    except Exception as e:
        logger.warning(f"{doc_id} pdf2image get_images {e}")
        images = []

    if images:
        # Keep only images that reach the minimum size in both dimensions.
        images = [im for im in images if im.width >= MIN_WIDTH and im.height >= MIN_HEIGHT]
    if not images:
        logger.warning(f"{doc_id} pdf2image has no images")
        return example

    example["pages"] = len(images)
    scope = inference_method.scope
    is_grid = scope == "sample-grid"
    if "sample" in scope and not is_grid:
        # Single-sample scopes work on one image chosen by the inference method.
        selected = [inference_method.get_page_scope(images)]
    else:
        selected = images
    selected = selected[:MAX_PAGES]

    if is_grid:
        # Raw images are kept; the grid built from them is extracted below.
        pixel_values = list(selected)
    else:
        pixel_values = [
            feature_extractor([im.convert("RGB")])["pixel_values"][0] for im in selected
        ]

    if not pixel_values:
        logger.warning(f"{doc_id} pdf2image has no valid images")
        example["pages"] = 0
        return example

    if is_grid:
        grid = inference_method.get_page_scope(pixel_values)
        pixel_values = feature_extractor([grid.convert("RGB")])["pixel_values"][0]
    elif "sample" in scope:
        pixel_values = pixel_values[0]
    example["pixel_values"] = np.array(pixel_values)
    return example
|
|