Hi,
I created a Python script using pyPDF that automatically merges all the PDFs in a folder and puts them into an output folder and rename them automatically as per the folder's name.
What I want to do now is for the script to search for sub-directories, process all the PDF files in them and create an output in the sub-directory and give it the same name as the sub=dir.
I have been trying to use the os.path.walk() function but I can't get the hang of it or understand it.
This is my current code:
#----------------------------------------------------------------------------------------------
# Name: pdfMerger
# Purpose: Automatic merging of all PDF files in a directory and its sub-directories and
# rename them according to the folder itself. Requires the pyPDF Module
#
# Current: Processes all the PDF files in the current directory
# To-Do: Process the sub-directories.
#
# Version: 1.0
# Author: Brian Livori
#
# Created: 03/08/2011
# Copyright: (c) Brian Livori 2011
# Licence: Open-Source
#---------------------------------------------------------------------------------------------
#!/usr/bin/env python
import os
import glob
import sys
from pyPdf import PdfFileReader, PdfFileWriter
output = PdfFileWriter()
path = str(os.getcwd())
x = 0
for infile in glob.glob( os.path.join(path, '*.pdf') ):
for (path, dirs, files) in os.walk(path, topdown=True):
for files in dirs:
i = 0
print "Merging " + infile
pdf = PdfFileReader(file( infile, "rb"))
x = pdf.getNumPages()
while (i != x):
output.addPage(pdf.getPage(i))
print "Merging page: " + str(i+1) + "/" + str(x)
i += 1
output_dir = "\Output\\"
ext = ".pdf"
dir = os.path.basename(path)
outputpath = str(os.getcwd()) + output_dir
final_output = outputpath
if os.path.exists(final_output) != True:
os.mkdir(final_output)
outputStream = file(final_output + dir + ext, "wb")
output.write(outputStream)
outputStream.close()
else:
outputStream = file(final_output + dir + ext, "wb")
output.write(outputStream)
outputStream.close() I do not understand this globbing before you are even inside walk loop.
Maybe you could check out my code for finding files of given list of extensions, those containing one of the list of texts:
http://www.daniweb.com/software-development/python/code/316585
There I am using the option of running function for all files instead of explicit looping.
What I want to do is get files in a folder and it's sub-directories and merge all the files in a folder (seperately - according to the folder they are in).
For example if there are 4 pdfs in the 'main' folder, the 4 pdfs will be merged and called main.pdf. If there's a sub-dir in the 'main' called 'testing' which contains 5 pdfs, the 5 pdf files are merged and called testing.pdf.
I need to do it recursively, so that all sub-directories are processed.
Any help on how to do it?
As of now, only the folders in the root folder are processed.
As I said you can simplify my code, anyway here version from it that process (here simple print) every .pdf file under topdir:
import os
def process_file(_, path, filelist):
for filename in filelist:
if filename.endswith('.pdf'):
print(os.path.basename(path), filename)
print(os.path.join(path, filename))
print('')
def files_recursively(topdir):
os.path.walk(os.path.realpath(topdir), process_file, ())
files_recursively('C:/')#----------------------------------------------------------------------------------------------
# Name: pdfMerger
# Purpose: Automatic merging of all PDF files in a directory and its sub-directories and
# rename them according to the folder itself. Requires the pyPDF Module
#
# Current: Processes all the PDF files in the current directory
# To-Do: Process the sub-directories.
#
# Version: 1.0
# Author: Brian Livori
#
# Created: 03/08/2011
# Copyright: (c) Brian Livori 2011
# Licence: Open-Source
#---------------------------------------------------------------------------------------------
#!/usr/bin/env python
import os
import glob
import sys
import fnmatch
from pyPdf import PdfFileReader, PdfFileWriter
output = PdfFileWriter()
path = str(os.getcwd())
x = 0
def process_file(_, path, filelist):
for filename in filelist:
if filename.endswith('.pdf'):
i = 0
print "Merging " + filename
pdf = PdfFileReader(file( filename, "rb"))
x = pdf.getNumPages()
def files_recursively(topdir):
os.path.walk(os.path.realpath(topdir), process_file, ())
files_recursively(path)
for infile in glob.glob( os.path.join(path, '*.pdf') ):
for (path, dirs, files) in os.walk(path, topdown=True):
for filename in find_files(path, '*.pdf'):
while (i != x):
output.addPage(pdf.getPage(i))
print "Merging page: " + str(i+1) + "/" + str(x)
i += 1
output_dir = "\Output\\"
ext = ".pdf"
dir = os.path.basename(path)
outputpath = str(os.getcwd()) + output_dir
final_output = outputpath
if os.path.exists(final_output) != True:
os.mkdir(final_output)
outputStream = file(final_output + dir + ext, "wb")
output.write(outputStream)
outputStream.close()
else:
outputStream = file(final_output + dir + ext, "wb")
output.write(outputStream)
outputStream.close()
This gives me:
IOError: [Errno 2] No such file or directory: 'LGA.CL1.651.2010.pdf'Ok. So I changed the code to this:
#----------------------------------------------------------------------------------------------
# Name: pdfMerger
# Purpose: Automatic merging of all PDF files in a directory and its sub-directories and
# rename them according to the folder itself. Requires the pyPDF Module
#
# Current: Processes all the PDF files in the current directory
# To-Do: Process the sub-directories.
#
# Version: 1.0
# Author: Brian Livori
#
# Created: 03/08/2011
# Copyright: (c) Brian Livori 2011
# Licence: Open-Source
#---------------------------------------------------------------------------------------------
#!/usr/bin/env python
import os
import glob
import sys
import fnmatch
from pyPdf import PdfFileReader, PdfFileWriter
output = PdfFileWriter()
path = str(os.getcwd())
x = 0
def process_file(_, path, filelist):
for filename in filelist:
if filename.endswith('.pdf'):
filename = os.path.join(path, filename)
print "Merging " + filename
pdf = PdfFileReader(file( filename, "rb"))
x = pdf.getNumPages()
i = 0
while (i != x):
output.addPage(pdf.getPage(i))
print "Merging page: " + str(i+1) + "/" + str(x)
i += 1
output_dir = "\Output\\"
ext = ".pdf"
dir = os.path.basename(path)
outputpath = str(os.getcwd()) + output_dir
final_output = outputpath
if os.path.exists(final_output) != True:
os.mkdir(final_output)
outputStream = file(final_output + dir + ext, "wb")
output.write(outputStream)
outputStream.close()
else:
outputStream = file(final_output + dir + ext, "wb")
output.write(outputStream)
outputStream.close()
def files_recursively(topdir):
os.path.walk(os.path.realpath(topdir), process_file, ())
files_recursively(path)
And it was working. I changed the pdf files in the folders (instead of two 100mb files I put 4 files of about 500KB each) and it is giving me an error.
Error Code:
Traceback (most recent call last):
File "C:\Documents and Settings\student3\Desktop\Test\pdfMergerV1_bkp.py", line 74, in <module>
files_recursively(path)
File "C:\Documents and Settings\student3\Desktop\Test\pdfMergerV1_bkp.py", line 72, in files_recursively
os.path.walk(os.path.realpath(topdir), process_file, ())
File "C:\Python27\lib\ntpath.py", line 263, in walk
walk(name, func, arg)
File "C:\Python27\lib\ntpath.py", line 259, in walk
func(arg, top, names)
File "C:\Documents and Settings\student3\Desktop\Test\pdfMergerV1_bkp.py", line 38, in process_file
pdf = PdfFileReader(file( filename, "rb"))
File "C:\Python27\lib\site-packages\pyPdf\pdf.py", line 374, in __init__
self.read(stream)
File "C:\Python27\lib\site-packages\pyPdf\pdf.py", line 775, in read
newTrailer = readObject(stream, self)
File "C:\Python27\lib\site-packages\pyPdf\generic.py", line 67, in readObject
return DictionaryObject.readFromStream(stream, pdf)
File "C:\Python27\lib\site-packages\pyPdf\generic.py", line 531, in readFromStream
value = readObject(stream, pdf)
File "C:\Python27\lib\site-packages\pyPdf\generic.py", line 58, in readObject
return ArrayObject.readFromStream(stream, pdf)
File "C:\Python27\lib\site-packages\pyPdf\generic.py", line 153, in readFromStream
arr.append(readObject(stream, pdf))
File "C:\Python27\lib\site-packages\pyPdf\generic.py", line 69, in readObject
return readHexStringFromStream(stream)
File "C:\Python27\lib\site-packages\pyPdf\generic.py", line 276, in readHexStringFromStream
txt += chr(int(x, base=16))
ValueError: invalid literal for int() with base 16: '\x00\x00' You should use at least os.path.join at line 61 an 67 to make your code cleaner. Also I would use 4 spaces instead of tab indention, as code commes out terrible in posts, even editor shows it OK.
I would prefer to use also open instead of file
Looks like generic.py is getting '\0x00\x00' ie two nulls, when it expects hexadesimal number string ('00', '0000' or '0x000' or similar), which it would convert to char, so it should be expecting two hexdigit string.
You should use at least os.path.join at line 61 an 67 to make your code cleaner. Also I would use 4 spaces instead of tab indention, as code commes out terrible in posts, even editor shows it OK.
I would prefer to use also open instead of file
Looks like generic.py is getting '\0x00\x00' ie two nulls, when it expects hexadesimal number string ('00', '0000' or '0x000' or similar), which it would convert to char, so it should be expecting two hexdigit string.
Many null characters may mean a unicode encoding problem.