PDF and PS scan script in Python
This is the script I use for counting the pages of ALL PDF and PS files in one folder.
I use it every day at Terradox, so it’s very useful for me… especially when I have folders with hundreds of files.
Here’s the code:
#!/usr/bin/env python
import os
from re import compile
try:
    # time.clock was removed in Python 3.8; perf_counter is its replacement.
    from time import perf_counter as clock
except ImportError:
    # Older interpreters (Python 2) only provide time.clock.
    from time import clock
def _last_number_in_file_(path, pattern):
    """Return the integer captured by the last match of `pattern` in a file.

    The file at `path` is read in binary mode and searched with the bytes
    regex `pattern`; group 1 of the final match is converted with int().
    Returns 0 when the pattern never matches.
    """
    # `with` guarantees the handle is closed even if read() raises.
    with open(path, 'rb') as vFileOpen:
        vContent = vFileOpen.read()
    vNumber = 0
    for vMatch in compile(pattern).finditer(vContent):
        # Keep overwriting: only the last match is wanted.
        vNumber = int(vMatch.group(1))
    return vNumber


def _scan_file_(file):
    """Return the number of pages found in a PS or PDF file.

    The file type is decided from the case-insensitive name suffix
    ('ps' or 'pdf'). For any other name a message is printed and 0 is
    returned.

    PS:  the page ordinal of the last `%%Page: (Page N) M` comment is used.
    PDF: Acrobat is tried through COM (win32com); if that fails for any
         reason, the last `/Count N` entry found in the raw bytes is used.
         NOTE(review): the fallback is a heuristic on raw bytes, it does
         not parse the PDF structure - confirm it is accurate enough.

    Raises whatever open()/read() raise when the file cannot be read.
    """
    # Lowercase a copy for the suffix checks only. The original path must
    # be kept intact for open(), otherwise files with uppercase characters
    # cannot be found on case-sensitive filesystems.
    vName = file.lower()
    vPages = 0

    if vName.endswith('ps'):  # If PS.
        print('Scanning pages from PS `%s`' % os.path.split(file)[1])
        # Group 1 requires at least one digit so int() never sees b''.
        # Taking the first match instead of the last would return an
        # incorrect number of pages.
        vPages = _last_number_in_file_(
            file, br"%%Page:[ ]['(']Page[ ]\d*[')'][ ](\d+)")

    elif vName.endswith('pdf'):  # If PDF.
        try:  # Try to use Acrobat through COM, on Windows.
            from win32com.client import Dispatch
            pdfdoc = Dispatch('AcroExch.PDdoc')
            print('Scan pages from PDF using Acrobat Reader.')
            pdfdoc.Open(file)
            vPages = pdfdoc.GetNumPages()
            pdfdoc.Close()
        except Exception:
            # Acrobat/win32com not available or failed: use the fallback.
            # `Exception` (not a bare except) lets Ctrl+C through.
            print('Scan pages from PDF using fallback.')
            vPages = _last_number_in_file_(file, br"/Count[ ](\d+)")

    else:
        print('File "%s" is neither PS, nor PDF.' % file)

    return vPages
#
if __name__ == '__main__':
    # Script entry point: scan every file of one folder and report the
    # page count of each, then the total elapsed time.
    # Local import: only the entry point needs sys.
    import sys

    ti = clock()
    # Folder to scan: first command-line argument if given, otherwise the
    # original hard-coded default.
    if len(sys.argv) > 1:
        vFolder = sys.argv[1]
    else:
        vFolder = 'D:/My Folder with PDF and PS Files'
    for f in os.listdir(vFolder):
        # os.path.join uses the right separator on every platform.
        vPath = os.path.join(vFolder, f)
        if not os.path.isfile(vPath):
            # Sub-directories cannot be opened as files; skip them.
            continue
        # %-formatting prints the same text under Python 2 and Python 3.
        print('Scanned: %s pages.' % _scan_file_(vPath))
    tf = clock()
    print('Operation took %.4f seconds.\n' % (tf-ti))
Hope you find it useful 😊
Originally posted at: https://crconstantin.tumblr.com/post/19799142826/