Skip to content

Instantly share code, notes, and snippets.

@lebedov
Created April 28, 2021 12:29
Show Gist options
  • Save lebedov/3518142a5c2431b8c9a28d323100558a to your computer and use it in GitHub Desktop.
Save lebedov/3518142a5c2431b8c9a28d323100558a to your computer and use it in GitHub Desktop.
How to use pdfbox's PDFTextStripper class in Python.
#!/usr/bin/env python3
"""
How to use pdfbox's PDFTextStripper class in Python.
"""
import pathlib
import pkg_resources
import re
import urllib.request
import appdirs
import jpype
import jpype.imports
import numpy as np
# Collect candidate pdfbox-app jars from the per-user cache directory that
# appdirs reports for the application name 'python-pdfbox'.
# NOTE(review): presumably this cache is populated by the python-pdfbox
# package; to use a jar stored elsewhere, point `file_list` at it instead.
a = appdirs.AppDirs('python-pdfbox')
cache_dir = pathlib.Path(a.user_cache_dir)
# Materialize the glob generator so the matches can be inspected and sorted.
file_list = [*cache_dir.glob('pdfbox-app-*.jar')]
def f(s):
    """
    Sort key: parse the version embedded in a pdfbox-app jar file name.

    Parameters
    ----------
    s : object with a ``name`` attribute (e.g. pathlib.Path)
        File whose name has the form ``pdfbox-app-<version>.jar``.

    Returns
    -------
    version
        Result of ``pkg_resources.parse_version`` applied to ``<version>``,
        usable for ordering jar files by release.

    Raises
    ------
    ValueError
        If the file name does not contain a recognizable version.
    """
    # Raw string: \w, \. and \- are regex escapes, not string escapes; in a
    # plain literal they trigger invalid-escape-sequence warnings.
    m = re.search(r'pdfbox-app-([\w\.\-]+)\.jar', s.name)
    if m is None:
        # Fail with a clear message instead of AttributeError on None.group.
        raise ValueError('cannot parse pdfbox version from %r' % (s.name,))
    return pkg_resources.parse_version(m.group(1))
# Fail early with a descriptive error when no jar was found; indexing the
# empty sorted list would otherwise raise a bare IndexError.
if not file_list:
    raise FileNotFoundError(
        'no pdfbox-app-*.jar file found in %s' % (cache_dir,))

# Put the jar with the highest parsed version on the Java classpath.
jpype.addClassPath(sorted(file_list, key=f)[-1])

# Start the JVM only once per process; headless mode is requested via the
# java.awt.headless system property, and convertStrings=False is passed so
# Java strings are not implicitly converted to Python str.
if not jpype.isJVMStarted():
    jpype.startJVM(
        jpype.getDefaultJVMPath(),
        '-Djava.awt.headless=true',
        convertStrings=False,
    )
from java.awt.image import BufferedImage
from java.io import File
from org.apache.pdfbox.pdmodel import PDDocument
from org.apache.pdfbox.text import PDFTextStripper
def extract_text(in_file):
    """
    Extract text of PDF file.

    Parameters
    ----------
    in_file : str or path-like
        Path to input PDF file.

    Returns
    -------
    text : str
        Extracted text.
    """
    # str() lets callers pass pathlib.Path objects as well as plain strings.
    # NOTE(review): PDDocument.load is the PDFBox 2.x entry point; PDFBox 3.x
    # replaced it with org.apache.pdfbox.Loader.loadPDF - confirm against the
    # jar version actually on the classpath.
    doc = PDDocument.load(File(str(in_file)))
    try:
        pdf_text_stripper = PDFTextStripper()
        text = pdf_text_stripper.getText(doc)
        # The JVM is started with convertStrings=False, so convert the Java
        # string to a Python str explicitly before returning.
        return str(text)
    finally:
        # Always release the document, even if stripping fails.
        doc.close()
if __name__ == '__main__':
    import os
    import tempfile
    import urllib.request

    # Download sample multipage PDF; the context manager closes the
    # HTTP response once the body has been read.
    with urllib.request.urlopen('https://researchtorevenue.files.wordpress.com/2015/04/1r41ai10801601_fong.pdf') as response:
        data = response.read()

    # mkstemp returns an already-open OS-level descriptor; wrap it with
    # os.fdopen so it is closed instead of leaked.
    fd, name = tempfile.mkstemp()
    try:
        # A distinct handle name avoids rebinding the module-level helper `f`.
        with os.fdopen(fd, 'wb') as pdf_file:
            pdf_file.write(data)
        result = extract_text(name)
    finally:
        # Remove the temporary file even if writing or extraction fails.
        os.unlink(name)
@sfinotti
Copy link

@lebedov Thank you again. You nailed it! The file was corrupted; I downloaded it again and it's working now. Thank you!

@mara004
Copy link

mara004 commented Jun 22, 2023

Nice example, almost looks easier than using the CLI!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment