Skip to content

Instantly share code, notes, and snippets.

@lebedov
Last active July 10, 2023 14:04
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lebedov/cefbec588c2b2bd0251ba505bd8bf933 to your computer and use it in GitHub Desktop.
Save lebedov/cefbec588c2b2bd0251ba505bd8bf933 to your computer and use it in GitHub Desktop.
How to call pdfbox's API with JPype.
#!/usr/bin/env python3
"""
How to call pdfbox's API with JPype.
"""
import pathlib
import pkg_resources
import re
import urllib.request
import appdirs
import jpype
import jpype.imports
import numpy as np
# Collect the pdfbox-app jar files found in the per-user cache directory that
# appdirs reports for the 'python-pdfbox' application. To use a jar stored
# elsewhere, replace these lines with the path to that jar file:
a = appdirs.AppDirs('python-pdfbox')
cache_dir = pathlib.Path(a.user_cache_dir)
file_list = list(cache_dir.glob('pdfbox-app-*.jar'))
def f(s):
    """
    Return the version embedded in a pdfbox-app jar file name as a sort key.

    Parameters
    ----------
    s : pathlib.Path
        Path whose file name looks like ``pdfbox-app-<version>.jar``.

    Returns
    -------
    version
        Result of ``pkg_resources.parse_version`` applied to ``<version>``;
        such objects compare in version order rather than lexically.

    Raises
    ------
    ValueError
        If the file name does not contain a recognizable version.
    """
    # Raw string: '\w', '\.' and '\-' are invalid escape sequences in an
    # ordinary string literal and trigger a SyntaxWarning on Python 3.12+.
    match = re.search(r'pdfbox-app-([\w\.\-]+)\.jar', s.name)
    if match is None:
        raise ValueError(f'cannot extract pdfbox version from {s.name!r}')
    return pkg_resources.parse_version(match.group(1))
# Fail with an explicit message instead of a bare IndexError when no jar has
# been placed in the cache directory.
if not file_list:
    raise FileNotFoundError(f'no pdfbox-app-*.jar file found in {cache_dir}')

# Put the jar with the highest version on the classpath. This is done before
# the JVM is started so that the pdfbox classes are visible to it.
jpype.addClassPath(sorted(file_list, key=f)[-1])
if not jpype.isJVMStarted():
    # Headless mode: pages are rendered off-screen, no display is needed.
    jpype.startJVM(jpype.getDefaultJVMPath(), '-Djava.awt.headless=true', convertStrings=False)

# Java packages are imported through jpype.imports; keep these imports after
# the JVM startup above, since they are resolved against the running JVM.
from java.awt.image import BufferedImage
from java.io import File
from org.apache.pdfbox.pdmodel import PDDocument
from org.apache.pdfbox.rendering import ImageType, PDFRenderer
def extract_images(in_file, pages=None, dpi=72):
    """
    Extract pages of PDF file as images.

    Parameters
    ----------
    in_file : str
        Path to input PDF file.
    pages : iterable
        Numbers of pages to extract (0-indexed). If None, return all pages.
    dpi : int
        Resolution at which to render output images.

    Returns
    -------
    output : list of numpy.ndarray
        PDF pages rendered into RGB numpy arrays of dtype uint8 and shape
        (height, width, 3), in the order given by `pages`.
    """
    doc = PDDocument.load(File(in_file))
    try:
        pdf_renderer = PDFRenderer(doc)
        if pages is None:
            pages = range(doc.getNumberOfPages())

        output = []
        for i in pages:
            im = pdf_renderer.renderImageWithDPI(i, dpi)
            h = im.getHeight()
            w = im.getWidth()

            # Retrieve the pixels as a flat Java int array; getRGB() packs
            # every pixel into one 32-bit integer laid out as 0xAARRGGBB.
            data = im.getRGB(0, 0, w, h, None, 0, w)[:]
            argb = np.frombuffer(memoryview(data), dtype=np.int32).reshape(h, w)

            # Pull the channels out with shifts rather than by reinterpreting
            # the raw bytes: the byte order of an int in memory depends on the
            # host (taking the first three bytes yields B, G, R on
            # little-endian machines), whereas shifts always give R, G, B.
            rgb = np.stack(
                ((argb >> 16) & 0xFF, (argb >> 8) & 0xFF, argb & 0xFF),
                axis=-1,
            ).astype(np.uint8)
            output.append(rgb)
        return output
    finally:
        # Release the file handle and buffers held by the PDF document even
        # when rendering fails part-way through.
        doc.close()
if __name__ == '__main__':
    # Demo: download a sample PDF, render every page, then clean up.
    import os
    import tempfile
    import urllib.request

    # Download sample multipage PDF:
    with urllib.request.urlopen('https://researchtorevenue.files.wordpress.com/2015/04/1r41ai10801601_fong.pdf') as response:
        data = response.read()

    fd, name = tempfile.mkstemp()
    try:
        # Wrap the descriptor returned by mkstemp() so that it is closed;
        # opening the path a second time would leave `fd` open.
        with os.fdopen(fd, 'wb') as pdf_file:
            pdf_file.write(data)
        result = extract_images(name)
    finally:
        # Remove the temporary file even if writing or rendering failed.
        os.unlink(name)
@mara004
Copy link

mara004 commented Jun 22, 2023

I know this is just an example, but for anyone copying this code, note that using a generator (yield expression) would be better in terms of memory management than stacking all bitmaps in a list.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment