PDF and PS scan script in Python

2010 Aug 14, Sat tumblr

This is the script I use to count the pages of ALL the PDF and PS files in a folder.
I use it every day at Terradox, so it’s very useful for me… especially when I have folders with hundreds of files.

Here’s the code:

#!/usr/bin/env python
import os
from re import compile

try:
    # time.clock() was removed in Python 3.8; perf_counter() replaces it.
    from time import perf_counter as clock
except ImportError:  # Python < 3.3 has no perf_counter().
    from time import clock

def _scan_file_(file):
    """Return the number of pages found in a PostScript or PDF file.

    The file type is chosen from the extension (case-insensitive):

    * ``.ps``  -- the file is searched for ``%%Page: (Page N) M`` markers and
      the number ``M`` of the last marker is returned.
    * ``.pdf`` -- Acrobat is asked through COM (``win32com``) when available;
      otherwise the file is searched for ``/Count N`` entries and the last
      value found is returned.

    Any other extension prints a notice and yields 0.  0 is also returned
    when no marker is found in the file.

    :param file: path of the file to inspect.
    :return: the page count as an integer (0 if it could not be determined).
    """
    # Lower-case only the extension: the path itself must stay untouched,
    # otherwise open() fails on case-sensitive file systems.
    vExt = os.path.splitext(file)[1].lower()
    vPages = 0

    if vExt == '.ps':  # If PS.
        print('Scanning pages from PS `%s`' % os.path.split(file)[1])
        with open(file, 'rb') as vFileOpen:
            vContent = vFileOpen.read()
        # Raw bytes pattern; the final group requires at least one digit so
        # that int() never receives an empty match.
        vPSpatt = compile(br"%%Page:[ ]['(']Page[ ]\d*[')'][ ](\d+)")
        # Keep the value of the LAST marker; stopping at the first match
        # would return an incorrect number of pages.
        for vMatch in vPSpatt.finditer(vContent):
            vPages = int(vMatch.group(1))

    elif vExt == '.pdf':  # If PDF.
        try:  # Try to use Acrobat through COM, on Windows.
            from win32com.client import Dispatch
            pdfdoc = Dispatch('AcroExch.PDdoc')
            print('Scan pages from PDF using Acrobat Reader.')
            pdfdoc.Open(file)
            vPages = pdfdoc.GetNumPages()
            pdfdoc.Close()
        except Exception:
            # Acrobat/win32com not available (or the COM call failed):
            # fall back to scanning the raw file.  `Exception` rather than a
            # bare except, so Ctrl-C and SystemExit still propagate.
            print('Scan pages from PDF using fallback.')
            with open(file, 'rb') as vFileOpen:
                vContent = vFileOpen.read()
            vPDFpatt = compile(br"/Count[ ](\d+)")
            # The last `/Count` entry in the file wins.
            for vMatch in vPDFpatt.finditer(vContent):
                vPages = int(vMatch.group(1))

    else:
        print('File "%s" is neither PS, nor PDF.' % file)

    return vPages

if __name__ == '__main__':
    # Scan every entry of one folder, print its page count, and report the
    # total time taken.
    import sys  # Local import: only the command-line entry point needs it.

    ti = clock()
    # Folder to scan: the first command-line argument when given, otherwise
    # the original hard-coded default.
    if len(sys.argv) > 1:
        vFolder = sys.argv[1]
    else:
        vFolder = 'D:/My Folder with PDF and PS Files'
    for f in os.listdir(vFolder):
        # os.path.join picks the right separator for the platform; the
        # single formatted print() gives the same output on Python 2 and 3.
        print('Scanned: %s pages.' % _scan_file_(os.path.join(vFolder, f)))
    tf = clock()
    print('Operation took %.4f seconds.\n' % (tf-ti))

Hope you find it useful 😊

Originally posted at: https://crconstantin.tumblr.com/post/19799142826/

@notes #dev #programming #python