From 22aaed75be489493cb7e201f8997d94e496f3747 Mon Sep 17 00:00:00 2001 From: Jeremy Ashkenas Date: Fri, 20 Aug 2010 13:17:00 -0400 Subject: [PATCH 1/2] Attempting some GraphicsMagick / GhostScript memory and disk usage optimizations. --- docsplit.gemspec | 4 ++-- index.html | 10 ++++++++- lib/docsplit.rb | 2 +- lib/docsplit/image_extractor.rb | 37 +++++++++++++++++++++------------ 4 files changed, 36 insertions(+), 17 deletions(-) diff --git a/docsplit.gemspec b/docsplit.gemspec index 02c4bad..35318e4 100755 --- a/docsplit.gemspec +++ b/docsplit.gemspec @@ -1,7 +1,7 @@ Gem::Specification.new do |s| s.name = 'docsplit' - s.version = '0.3.3' # Keep version in sync with docsplit.rb - s.date = '2010-8-17' + s.version = '0.3.4' # Keep version in sync with docsplit.rb + s.date = '2010-8-20' s.homepage = "http://documentcloud.github.com/docsplit/" s.summary = "Break Apart Documents into Images, Text, Pages and PDFs" diff --git a/index.html b/index.html index 5f1a292..23c21ad 100755 --- a/index.html +++ b/index.html @@ -98,7 +98,7 @@

Docsplit

(title, author, number of pages...)

-

Docsplit is currently at version 0.3.3.

+

Docsplit is currently at version 0.3.4.

Docsplit is an open-source component of DocumentCloud. @@ -279,6 +279,14 @@

Internals

Change Log

+

+ 0.3.4
+ A number of Memory / Disk Space optimizations to the way that GraphicsMagick + and GhostScript are used to generate images from PDFs. We now delegate + directly to GhostScript, and only run GraphicsMagick on ten pages at a time, + before clearing out the temporary files. +

+

0.3.3
Start using the MAGICK_TMPDIR environment variable to prevent parallel diff --git a/lib/docsplit.rb b/lib/docsplit.rb index 3b07658..bd47852 100755 --- a/lib/docsplit.rb +++ b/lib/docsplit.rb @@ -1,7 +1,7 @@ # The Docsplit module delegates to the Java PDF extractors. module Docsplit - VERSION = '0.3.3' # Keep in sync with gemspec. + VERSION = '0.3.4' # Keep in sync with gemspec. ROOT = File.expand_path(File.dirname(__FILE__) + '/..') diff --git a/lib/docsplit/image_extractor.rb b/lib/docsplit/image_extractor.rb index 17bee57..de6ce9d 100755 --- a/lib/docsplit/image_extractor.rb +++ b/lib/docsplit/image_extractor.rb @@ -4,9 +4,10 @@ module Docsplit # nicely sized images. class ImageExtractor - DENSITY_ARG = "-density 150" - MEMORY_ARGS = "-limit memory 128MiB -limit map 256MiB" - DEFAULT_FORMAT = :png + DENSITY_ARG = "-density 100" + MEMORY_ARGS = "-limit memory 256MiB -limit map 512MiB" + GHOSTSCRIPT_ARGS = "-q -dBATCH -dMaxBitmap=50000000 -dNOPAUSE -sDEVICE=tiff24nc -dTextAlphaBits=4 -dGraphicsAlphaBits=4 -r100x100" + DEFAULT_FORMAT = :png # Extract a list of PDFs as rasterized page images, according to the # configuration in options. @@ -27,18 +28,23 @@ def convert(pdf, size, format, previous=nil) tempdir = Dir.mktmpdir basename = File.basename(pdf, File.extname(pdf)) directory = directory_for(size) + pages = @pages || '1-' + Docsplit.extract_length(pdf).to_s FileUtils.mkdir_p(directory) unless File.exists?(directory) + tiff_file = File.join(tempdir, "#{basename}.tif") out_file = File.join(directory, "#{basename}_%05d.#{format}") common = "#{MEMORY_ARGS} #{DENSITY_ARG} #{resize_arg(size)} #{quality_arg(format)}" if previous FileUtils.cp(Dir[directory_for(previous) + '/*'], directory) cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1" else - cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{common} \"#{pdf}#{pages_arg}\" \"#{out_file}\" 2>&1" + cmd = "gs #{GHOSTSCRIPT_ARGS} -sOutputFile=#{tiff_file} -- #{pdf}" + page_list(pages, 10).each do |nums| + cmd += " && MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{common} \"#{tiff_file}#{pages_arg(nums)}\" \"#{out_file}\" 2>&1" + end end result = `#{cmd}`.chomp raise ExtractionFailed, result if $? != 0 - renumber_images(out_file, format) + renumber_images(pages, out_file, format) FileUtils.remove_entry_secure tempdir if File.exists?(tempdir) end @@ -77,15 +83,13 @@ def quality_arg(format) end # Generate the requested page index into the document. - def pages_arg - return '' if @pages.nil? - pages = @pages.gsub(/\d+/) {|digits| (digits.to_i - 1).to_s } - "[#{pages}]" + def pages_arg(numbers) + '[' + numbers.map {|num| num - 1 }.join(',') + ']' end # Generate the expanded list of requested page numbers. - def page_list - @pages.split(',').map { |range| + def page_list(pages, chunk_count=nil) + list = pages.split(',').map { |range| if range.include?('-') range = range.split('-') Range.new(range.first.to_i, range.last.to_i).to_a.map {|n| n.to_i } @@ -93,18 +97,25 @@ def page_list range.to_i end }.flatten.sort + return list unless chunk_count + chunks = [] + list.each_with_index do |num, i| + chunks << [] if i % chunk_count == 0 + chunks.last << num + end + chunks end # When GraphicsMagick is through, it will have generated a number of # incrementing page images, starting at 0. Renumber them with their correct # page numbers. - def renumber_images(template, format) + def renumber_images(pages, template, format) suffixer = /_0+(\d+)\.#{format}\Z/ images = Dir[template.sub('%05d', '0*')].map do |path| index = path[suffixer, 1].to_i {:path => path, :index => index, :page_number => index + 1} end - numbers = @pages ? page_list.reverse : nil + numbers = page_list(pages).reverse images.sort_by {|i| -i[:page_number] }.each_with_index do |image, i| number = numbers ? numbers[i] : image[:page_number] FileUtils.mv(image[:path], image[:path].sub(suffixer, "_#{number}.#{format}")) From 62794ed265fe0fba52977c3b19762edf4eb3b559 Mon Sep 17 00:00:00 2001 From: Jeremy Ashkenas Date: Fri, 20 Aug 2010 14:31:52 -0400 Subject: [PATCH 2/2] correct error in naming and renumbering 10-page chunks of images. --- lib/docsplit/image_extractor.rb | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/lib/docsplit/image_extractor.rb b/lib/docsplit/image_extractor.rb index de6ce9d..396c832 100755 --- a/lib/docsplit/image_extractor.rb +++ b/lib/docsplit/image_extractor.rb @@ -8,6 +8,7 @@ class ImageExtractor MEMORY_ARGS = "-limit memory 256MiB -limit map 512MiB" GHOSTSCRIPT_ARGS = "-q -dBATCH -dMaxBitmap=50000000 -dNOPAUSE -sDEVICE=tiff24nc -dTextAlphaBits=4 -dGraphicsAlphaBits=4 -r100x100" DEFAULT_FORMAT = :png + CHUNK_SIZE = 10 # Extract a list of PDFs as rasterized page images, according to the # configuration in options. @@ -31,20 +32,20 @@ def convert(pdf, size, format, previous=nil) pages = @pages || '1-' + Docsplit.extract_length(pdf).to_s FileUtils.mkdir_p(directory) unless File.exists?(directory) tiff_file = File.join(tempdir, "#{basename}.tif") - out_file = File.join(directory, "#{basename}_%05d.#{format}") common = "#{MEMORY_ARGS} #{DENSITY_ARG} #{resize_arg(size)} #{quality_arg(format)}" if previous FileUtils.cp(Dir[directory_for(previous) + '/*'], directory) cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1" else cmd = "gs #{GHOSTSCRIPT_ARGS} -sOutputFile=#{tiff_file} -- #{pdf}" - page_list(pages, 10).each do |nums| + page_list(pages, CHUNK_SIZE).each_with_index do |nums, chunk| + out_file = File.join(directory, "#{basename}_chunk#{chunk}_%05d.#{format}") cmd += " && MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{common} \"#{tiff_file}#{pages_arg(nums)}\" \"#{out_file}\" 2>&1" end end result = `#{cmd}`.chomp raise ExtractionFailed, result if $? != 0 - renumber_images(pages, out_file, format) + renumber_images(pages, File.join(directory, basename + '*.' + format), format) unless previous FileUtils.remove_entry_secure tempdir if File.exists?(tempdir) end @@ -109,10 +110,11 @@ def page_list(pages, chunk_count=nil) # When GraphicsMagick is through, it will have generated a number of # incrementing page images, starting at 0. Renumber them with their correct # page numbers. - def renumber_images(pages, template, format) - suffixer = /_0+(\d+)\.#{format}\Z/ - images = Dir[template.sub('%05d', '0*')].map do |path| - index = path[suffixer, 1].to_i + def renumber_images(pages, glob, format) + suffixer = /_chunk(\d+)_0+(\d+)\.#{format}\Z/ + images = Dir[glob].map do |path| + chunk = path[suffixer, 1].to_i + index = chunk * CHUNK_SIZE + path[suffixer, 2].to_i {:path => path, :index => index, :page_number => index + 1} end numbers = page_list(pages).reverse