diff --git a/docsplit.gemspec b/docsplit.gemspec index 02c4bad..35318e4 100755 --- a/docsplit.gemspec +++ b/docsplit.gemspec @@ -1,7 +1,7 @@ Gem::Specification.new do |s| s.name = 'docsplit' - s.version = '0.3.3' # Keep version in sync with docsplit.rb - s.date = '2010-8-17' + s.version = '0.3.4' # Keep version in sync with docsplit.rb + s.date = '2010-8-20' s.homepage = "http://documentcloud.github.com/docsplit/" s.summary = "Break Apart Documents into Images, Text, Pages and PDFs" diff --git a/index.html b/index.html index 5f1a292..23c21ad 100755 --- a/index.html +++ b/index.html @@ -98,7 +98,7 @@

Docsplit

(title, author, number of pages...)

-

Docsplit is currently at version 0.3.3.

+

Docsplit is currently at version 0.3.4.

Docsplit is an open-source component of DocumentCloud. @@ -279,6 +279,14 @@

Internals

Change Log

+

+ 0.3.4
+ A number of memory and disk space optimizations to the way that GraphicsMagick + and Ghostscript are used to generate images from PDFs. We now delegate + directly to Ghostscript, and only run GraphicsMagick on ten pages at a time, + before clearing out the temporary files. +

+

0.3.3
Start using the MAGICK_TMPDIR environment variable to prevent parallel diff --git a/lib/docsplit.rb b/lib/docsplit.rb index 3b07658..bd47852 100755 --- a/lib/docsplit.rb +++ b/lib/docsplit.rb @@ -1,7 +1,7 @@ # The Docsplit module delegates to the Java PDF extractors. module Docsplit - VERSION = '0.3.3' # Keep in sync with gemspec. + VERSION = '0.3.4' # Keep in sync with gemspec. ROOT = File.expand_path(File.dirname(__FILE__) + '/..') diff --git a/lib/docsplit/image_extractor.rb b/lib/docsplit/image_extractor.rb index 17bee57..396c832 100755 --- a/lib/docsplit/image_extractor.rb +++ b/lib/docsplit/image_extractor.rb @@ -4,9 +4,11 @@ module Docsplit # nicely sized images. class ImageExtractor - DENSITY_ARG = "-density 150" - MEMORY_ARGS = "-limit memory 128MiB -limit map 256MiB" - DEFAULT_FORMAT = :png + DENSITY_ARG = "-density 100" + MEMORY_ARGS = "-limit memory 256MiB -limit map 512MiB" + GHOSTSCRIPT_ARGS = "-q -dBATCH -dMaxBitmap=50000000 -dNOPAUSE -sDEVICE=tiff24nc -dTextAlphaBits=4 -dGraphicsAlphaBits=4 -r100x100" + DEFAULT_FORMAT = :png + CHUNK_SIZE = 10 # Extract a list of PDFs as rasterized page images, according to the # configuration in options. 
@@ -27,18 +29,23 @@ def convert(pdf, size, format, previous=nil) tempdir = Dir.mktmpdir basename = File.basename(pdf, File.extname(pdf)) directory = directory_for(size) + pages = @pages || '1-' + Docsplit.extract_length(pdf).to_s FileUtils.mkdir_p(directory) unless File.exists?(directory) - out_file = File.join(directory, "#{basename}_%05d.#{format}") + tiff_file = File.join(tempdir, "#{basename}.tif") common = "#{MEMORY_ARGS} #{DENSITY_ARG} #{resize_arg(size)} #{quality_arg(format)}" if previous FileUtils.cp(Dir[directory_for(previous) + '/*'], directory) cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1" else - cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{common} \"#{pdf}#{pages_arg}\" \"#{out_file}\" 2>&1" + cmd = "gs #{GHOSTSCRIPT_ARGS} -sOutputFile=#{tiff_file} -- #{pdf}" + page_list(pages, CHUNK_SIZE).each_with_index do |nums, chunk| + out_file = File.join(directory, "#{basename}_chunk#{chunk}_%05d.#{format}") + cmd += " && MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{common} \"#{tiff_file}#{pages_arg(nums)}\" \"#{out_file}\" 2>&1" + end end result = `#{cmd}`.chomp raise ExtractionFailed, result if $? != 0 - renumber_images(out_file, format) + renumber_images(pages, File.join(directory, basename + '*.' + format), format) unless previous FileUtils.remove_entry_secure tempdir if File.exists?(tempdir) end @@ -77,15 +84,13 @@ def quality_arg(format) end # Generate the requested page index into the document. - def pages_arg - return '' if @pages.nil? - pages = @pages.gsub(/\d+/) {|digits| (digits.to_i - 1).to_s } - "[#{pages}]" + def pages_arg(numbers) + '[' + numbers.map {|num| num - 1 }.join(',') + ']' end # Generate the expanded list of requested page numbers. 
- def page_list - @pages.split(',').map { |range| + def page_list(pages, chunk_count=nil) + list = pages.split(',').map { |range| if range.include?('-') range = range.split('-') Range.new(range.first.to_i, range.last.to_i).to_a.map {|n| n.to_i } @@ -93,18 +98,26 @@ def page_list range.to_i end }.flatten.sort + return list unless chunk_count + chunks = [] + list.each_with_index do |num, i| + chunks << [] if i % chunk_count == 0 + chunks.last << num + end + chunks end # When GraphicsMagick is through, it will have generated a number of # incrementing page images, starting at 0. Renumber them with their correct # page numbers. - def renumber_images(template, format) - suffixer = /_0+(\d+)\.#{format}\Z/ - images = Dir[template.sub('%05d', '0*')].map do |path| - index = path[suffixer, 1].to_i + def renumber_images(pages, glob, format) + suffixer = /_chunk(\d+)_0+(\d+)\.#{format}\Z/ + images = Dir[glob].map do |path| + chunk = path[suffixer, 1].to_i + index = chunk * CHUNK_SIZE + path[suffixer, 2].to_i {:path => path, :index => index, :page_number => index + 1} end - numbers = @pages ? page_list.reverse : nil + numbers = page_list(pages).reverse images.sort_by {|i| -i[:page_number] }.each_with_index do |image, i| number = numbers ? numbers[i] : image[:page_number] FileUtils.mv(image[:path], image[:path].sub(suffixer, "_#{number}.#{format}"))