#!/usr/bin/python

# Sys and os for args and concatenating paths
import sys
import os
# argparse
import argparse
# glob is for iterating files in a directory
import glob
# binascii stuff for reading image raw data / header
from binascii import b2a_hex
# Regular expression stuff
import re

# pdfminer Imports
from pdfminer.pdfparser import PDFParser, PDFDocument, PDFNoOutlines
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBox, LTTextLine, LTFigure, LTImage


###
### Extracting Images
###

def write_file(folder, filename, filedata, flags='w'):
    """Write filedata to folder/filename and report whether it succeeded.

    flags is passed to open() as the mode: 'w' for text, 'wb' for binary,
    use 'a' instead of 'w' to append.

    Returns True when the data was written, False when folder is not an
    existing directory or an IOError occurred while opening or writing.
    """
    if not os.path.isdir(folder):
        return False
    try:
        # The with-statement closes the handle even when write() fails.
        with open(os.path.join(folder, filename), flags) as file_obj:
            file_obj.write(filedata)
    except IOError:
        return False
    return True


def determine_image_type(stream_first_4_bytes):
    """Return the image file extension matching the leading magic number.

    Only the first 4 bytes are inspected (2 for JPEG and BMP). Returns
    '.jpeg', '.png', '.gif' or '.bmp', or None when nothing matches.
    """
    # b'' literals keep the comparison valid whether b2a_hex returns a
    # str (Python 2) or bytes (Python 3).
    magic_numbers = (
        (b'ffd8', '.jpeg'),
        (b'89504e47', '.png'),
        (b'47494638', '.gif'),
        (b'424d', '.bmp'),
    )
    bytes_as_hex = b2a_hex(stream_first_4_bytes[:4])
    for hex_prefix, file_type in magic_numbers:
        if bytes_as_hex.startswith(hex_prefix):
            return file_type
    return None


def save_image(lt_image, page_number, images_folder):
    """Save the raw stream of an LTImage into images_folder.

    The file is named '<page_number>_<lt_image.name><extension>'.

    Returns the file name on success, or None when the image has no
    stream, the stream is empty, its type is not recognised, or the
    write failed.
    """
    if not lt_image.stream:
        return None
    file_stream = lt_image.stream.get_rawdata()
    if not file_stream:
        return None
    file_ext = determine_image_type(file_stream[0:4])
    if not file_ext:
        return None
    file_name = ''.join([str(page_number), '_', lt_image.name, file_ext])
    if write_file(images_folder, file_name, file_stream, flags='wb'):
        return file_name
    return None


# Iterate recursively through PDF layout objects and extract text and images
def parse_lt_objs(lt_objs, page_number, image_path, text=None):
    """Collect the text held by a sequence of LT* layout objects.

    LTTextBox / LTTextLine objects contribute their text. LTImage objects
    are saved into image_path (skipped when image_path is ''), and the
    outcome is printed. LTFigure objects are containers, so their children
    are parsed recursively.

    The text parameter is unused; it is kept only so existing callers that
    pass it keep working.

    Returns the collected text pieces joined with newlines.
    """
    text_content = []
    for lt_obj in lt_objs:
        if isinstance(lt_obj, (LTTextBox, LTTextLine)):
            # text
            text_content.append(lt_obj.get_text())
        elif isinstance(lt_obj, LTImage):
            # an image, so save it to the designated folder, if -i was given
            if image_path != '':
                saved_file = save_image(lt_obj, page_number, image_path)
                if saved_file is None:
                    print('Error saving image on page ' + str(page_number)
                          + ' to ' + image_path)
                else:
                    print('Saved image ' + saved_file)
        elif isinstance(lt_obj, LTFigure):
            # LTFigure objects are containers for other LT* objects, so
            # recurse through the children
            text_content.append(
                parse_lt_objs(lt_obj, page_number, image_path))
    return '\n'.join(text_content)


# Parse each page of the pdf document; the layout objects of every page are
# parsed from within this function
def _parse_pages(doc, *args):
    """Parse every page of an open PDFDocument and search its text.

    This is a higher-order function to be passed to with_pdf(). It expects
    two extra positional arguments: args[0] is the list of regular
    expressions to search for and args[1] is the folder images are saved
    to ('' disables image saving).

    Every regex match is printed together with its 1-based page number.

    Returns a list of strings, one per page of the document.
    """
    regex_list = args[0]
    image_path = args[1]
    # Compile once instead of once per page.
    patterns = [re.compile(regex) for regex in regex_list]

    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    text_content = []
    for page_number, page in enumerate(doc.get_pages(), 1):
        interpreter.process_page(page)
        # layout is an LTPage object which may contain child objects like
        # LTTextBox, LTFigure, LTImage, etc.
        layout = device.get_result()
        page_text = parse_lt_objs(layout, page_number, image_path)
        text_content.append(page_text)

        for pattern in patterns:
            for hit in pattern.findall(page_text):
                # findall yields tuples when the pattern has several groups
                if isinstance(hit, tuple):
                    hit = ' | '.join(hit)
                print('HIT on page ' + str(page_number) + ': ' + hit)
    return text_content


# General function that does pdf file loading and applies a parse
# function to our document
def with_pdf(pdf_doc, pdf_pwd, fn, *args):
    """Open the pdf document, apply fn to it and return the result.

    fn is called as fn(doc, *args) with the initialized PDFDocument.

    Returns whatever fn returns, or None when the file cannot be read
    (an IOError is reported on stderr) or the document is not extractable.
    """
    result = None
    try:
        # The with-statement closes the file even when parsing fails.
        with open(pdf_doc, 'rb') as fp:
            # create a parser object associated with the file object
            parser = PDFParser(fp)
            # create a PDFDocument object that stores the document structure
            doc = PDFDocument()
            # connect the parser and document objects
            parser.set_document(doc)
            doc.set_parser(parser)
            # supply the password for initialization
            doc.initialize(pdf_pwd)
            if doc.is_extractable:
                result = fn(doc, *args)
    except IOError as err:
        # the file doesn't exist or a similar problem; keep going with the
        # remaining files but tell the user why this one was skipped
        sys.stderr.write('Could not read ' + str(pdf_doc) + ': '
                         + str(err) + '\n')
    return result


def get_pages(pdf_doc, regexList, image_path):
    """Process every page of the pdf file and return its entire text.

    Pages are separated by a blank line. Returns '' when the document
    could not be read or is not extractable.
    """
    pdf_pwd = ''
    pages = with_pdf(pdf_doc, pdf_pwd, _parse_pages, regexList, image_path)
    if pages is None:
        return ''
    return '\n\n'.join(pages)


def search_pages(pdf_doc, regexList, image_path):
    """Run the regex search (and optional image export) over one pdf file.

    Matches are printed while the pages are parsed; the extracted text
    itself is discarded.
    """
    get_pages(pdf_doc, regexList, image_path)


def main():
    """Parse the command line and search every *.pdf file in the folder."""
    parser = argparse.ArgumentParser()
    parser.add_argument('-p', action='store', dest='pdf_path', required=True,
                        help='Path to a directory with PDF files to be parsed')
    parser.add_argument('-e', action='append', dest='regexList', default=[],
                        help='Regular expression for searching PDFs')
    parser.add_argument('-i', action='store', dest='image_path', default='',
                        help='Optional path for saving images found within PDF')
    results = parser.parse_args()

    # Iterate through all pdf files
    for pdffile in glob.glob(os.path.join(results.pdf_path, '*.pdf')):
        print("\n\nCurrent file is: " + pdffile)
        # separator line of 50 '=' characters
        print('= ' * 50)
        search_pages(pdffile, results.regexList, results.image_path)


if __name__ == "__main__":
    main()