From 22aaed75be489493cb7e201f8997d94e496f3747 Mon Sep 17 00:00:00 2001
From: Jeremy Ashkenas <jashkenas@gmail.com>
Date: Fri, 20 Aug 2010 13:17:00 -0400
Subject: [PATCH 1/2] Attempting some GraphicsMagick / GhostScript memory and
 disk usage optimizations.

---
 docsplit.gemspec                |  4 ++--
 index.html                      | 10 ++++++++-
 lib/docsplit.rb                 |  2 +-
 lib/docsplit/image_extractor.rb | 37 +++++++++++++++++++++------------
 4 files changed, 36 insertions(+), 17 deletions(-)
diff --git a/docsplit.gemspec b/docsplit.gemspec
index 02c4bad..35318e4 100755
--- a/docsplit.gemspec
+++ b/docsplit.gemspec
@@ -1,7 +1,7 @@
 Gem::Specification.new do |s|
   s.name      = 'docsplit'
-  s.version   = '0.3.3'         # Keep version in sync with docsplit.rb
-  s.date      = '2010-8-17'
+  s.version   = '0.3.4'         # Keep version in sync with docsplit.rb
+  s.date      = '2010-8-20'
 
   s.homepage    = "http://documentcloud.github.com/docsplit/"
   s.summary     = "Break Apart Documents into Images, Text, Pages and PDFs"
diff --git a/index.html b/index.html
index 5f1a292..23c21ad 100755
--- a/index.html
+++ b/index.html
@@ -98,7 +98,7 @@ <h1>Doc<sub style="font-size:150%;">&#9889;</sub>split</h1>
       (title, author, number of pages...)
     </p>
 
-    <p>Docsplit is currently at <a href="http://rubygems.org/gems/docsplit">version 0.3.3</a>.</p>
+    <p>Docsplit is currently at <a href="http://rubygems.org/gems/docsplit">version 0.3.4</a>.</p>
 
     <p>
       <i>Docsplit is an open-source component of <a href="http://documentcloud.org/">DocumentCloud</a>.</i>
@@ -279,6 +279,14 @@ <h2 id="internals">Internals</h2>
 
     <h2 id="changes">Change Log</h2>
     
+    <p>
+      <b class="header">0.3.4</b><br />
+      A number of memory and disk space optimizations to the way that GraphicsMagick
+      and Ghostscript are used to generate images from PDFs. We now delegate
+      directly to Ghostscript, and only run GraphicsMagick on ten pages at a time,
+      before clearing out the temporary files.
+    </p>
+    
     <p>
       <b class="header">0.3.3</b><br />
       Start using the MAGICK_TMPDIR environment variable to prevent parallel
diff --git a/lib/docsplit.rb b/lib/docsplit.rb
index 3b07658..bd47852 100755
--- a/lib/docsplit.rb
+++ b/lib/docsplit.rb
@@ -1,7 +1,7 @@
 # The Docsplit module delegates to the Java PDF extractors.
 module Docsplit
 
-  VERSION       = '0.3.3' # Keep in sync with gemspec.
+  VERSION       = '0.3.4' # Keep in sync with gemspec.
 
   ROOT          = File.expand_path(File.dirname(__FILE__) + '/..')
 
diff --git a/lib/docsplit/image_extractor.rb b/lib/docsplit/image_extractor.rb
index 17bee57..de6ce9d 100755
--- a/lib/docsplit/image_extractor.rb
+++ b/lib/docsplit/image_extractor.rb
@@ -4,9 +4,10 @@ module Docsplit
   # nicely sized images.
   class ImageExtractor
 
-    DENSITY_ARG     = "-density 150"
-    MEMORY_ARGS     = "-limit memory 128MiB -limit map 256MiB"
-    DEFAULT_FORMAT  = :png
+    DENSITY_ARG       = "-density 100"
+    MEMORY_ARGS       = "-limit memory 256MiB -limit map 512MiB"
+    GHOSTSCRIPT_ARGS  = "-q -dBATCH -dMaxBitmap=50000000 -dNOPAUSE -sDEVICE=tiff24nc -dTextAlphaBits=4 -dGraphicsAlphaBits=4 -r100x100"
+    DEFAULT_FORMAT    = :png
 
     # Extract a list of PDFs as rasterized page images, according to the
     # configuration in options.
@@ -27,18 +28,23 @@ def convert(pdf, size, format, previous=nil)
       tempdir   = Dir.mktmpdir
       basename  = File.basename(pdf, File.extname(pdf))
       directory = directory_for(size)
+      pages     = @pages || '1-' + Docsplit.extract_length(pdf).to_s
       FileUtils.mkdir_p(directory) unless File.exists?(directory)
+      tiff_file = File.join(tempdir, "#{basename}.tif")
       out_file  = File.join(directory, "#{basename}_%05d.#{format}")
       common    = "#{MEMORY_ARGS} #{DENSITY_ARG} #{resize_arg(size)} #{quality_arg(format)}"
       if previous
         FileUtils.cp(Dir[directory_for(previous) + '/*'], directory)
         cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1"
       else
-        cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{common} \"#{pdf}#{pages_arg}\" \"#{out_file}\" 2>&1"
+        cmd = "gs #{GHOSTSCRIPT_ARGS} -sOutputFile=#{tiff_file} -- #{pdf}"
+        page_list(pages, 10).each do |nums|
+          cmd += " && MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{common} \"#{tiff_file}#{pages_arg(nums)}\" \"#{out_file}\" 2>&1"
+        end
       end
       result = `#{cmd}`.chomp
       raise ExtractionFailed, result if $? != 0
-      renumber_images(out_file, format)
+      renumber_images(pages, out_file, format)
       FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
     end
 
@@ -77,15 +83,13 @@ def quality_arg(format)
     end
 
     # Generate the requested page index into the document.
-    def pages_arg
-      return '' if @pages.nil?
-      pages = @pages.gsub(/\d+/) {|digits| (digits.to_i - 1).to_s }
-      "[#{pages}]"
+    def pages_arg(numbers)
+      '[' + numbers.map {|num| num - 1 }.join(',') + ']'
     end
 
     # Generate the expanded list of requested page numbers.
-    def page_list
-      @pages.split(',').map { |range|
+    def page_list(pages, chunk_count=nil)
+      list = pages.split(',').map { |range|
         if range.include?('-')
           range = range.split('-')
           Range.new(range.first.to_i, range.last.to_i).to_a.map {|n| n.to_i }
@@ -93,18 +97,25 @@ def page_list
           range.to_i
         end
       }.flatten.sort
+      return list unless chunk_count
+      chunks = []
+      list.each_with_index do |num, i|
+        chunks << [] if i % chunk_count == 0
+        chunks.last << num
+      end
+      chunks
     end
 
     # When GraphicsMagick is through, it will have generated a number of
     # incrementing page images, starting at 0. Renumber them with their correct
     # page numbers.
-    def renumber_images(template, format)
+    def renumber_images(pages, template, format)
       suffixer = /_0+(\d+)\.#{format}\Z/
       images = Dir[template.sub('%05d', '0*')].map do |path|
         index = path[suffixer, 1].to_i
         {:path => path, :index => index, :page_number => index + 1}
       end
-      numbers = @pages ? page_list.reverse : nil
+      numbers = page_list(pages).reverse
       images.sort_by {|i| -i[:page_number] }.each_with_index do |image, i|
         number = numbers ? numbers[i] : image[:page_number]
         FileUtils.mv(image[:path], image[:path].sub(suffixer, "_#{number}.#{format}"))

From 62794ed265fe0fba52977c3b19762edf4eb3b559 Mon Sep 17 00:00:00 2001
From: Jeremy Ashkenas <jashkenas@gmail.com>
Date: Fri, 20 Aug 2010 14:31:52 -0400
Subject: [PATCH 2/2] correct error in naming and renumbering 10-page chunks of
 images.

---
 lib/docsplit/image_extractor.rb | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/lib/docsplit/image_extractor.rb b/lib/docsplit/image_extractor.rb
index de6ce9d..396c832 100755
--- a/lib/docsplit/image_extractor.rb
+++ b/lib/docsplit/image_extractor.rb
@@ -8,6 +8,7 @@ class ImageExtractor
     MEMORY_ARGS       = "-limit memory 256MiB -limit map 512MiB"
     GHOSTSCRIPT_ARGS  = "-q -dBATCH -dMaxBitmap=50000000 -dNOPAUSE -sDEVICE=tiff24nc -dTextAlphaBits=4 -dGraphicsAlphaBits=4 -r100x100"
     DEFAULT_FORMAT    = :png
+    CHUNK_SIZE        = 10
 
     # Extract a list of PDFs as rasterized page images, according to the
     # configuration in options.
@@ -31,20 +32,20 @@ def convert(pdf, size, format, previous=nil)
       pages     = @pages || '1-' + Docsplit.extract_length(pdf).to_s
       FileUtils.mkdir_p(directory) unless File.exists?(directory)
       tiff_file = File.join(tempdir, "#{basename}.tif")
-      out_file  = File.join(directory, "#{basename}_%05d.#{format}")
       common    = "#{MEMORY_ARGS} #{DENSITY_ARG} #{resize_arg(size)} #{quality_arg(format)}"
       if previous
         FileUtils.cp(Dir[directory_for(previous) + '/*'], directory)
         cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1"
       else
         cmd = "gs #{GHOSTSCRIPT_ARGS} -sOutputFile=#{tiff_file} -- #{pdf}"
-        page_list(pages, 10).each do |nums|
+        page_list(pages, CHUNK_SIZE).each_with_index do |nums, chunk|
+          out_file = File.join(directory, "#{basename}_chunk#{chunk}_%05d.#{format}")
           cmd += " && MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{common} \"#{tiff_file}#{pages_arg(nums)}\" \"#{out_file}\" 2>&1"
         end
       end
       result = `#{cmd}`.chomp
       raise ExtractionFailed, result if $? != 0
-      renumber_images(pages, out_file, format)
+      renumber_images(pages, File.join(directory, basename + '*.' + format), format) unless previous
       FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
     end
 
@@ -109,10 +110,11 @@ def page_list(pages, chunk_count=nil)
     # When GraphicsMagick is through, it will have generated a number of
     # incrementing page images, starting at 0. Renumber them with their correct
     # page numbers.
-    def renumber_images(pages, template, format)
-      suffixer = /_0+(\d+)\.#{format}\Z/
-      images = Dir[template.sub('%05d', '0*')].map do |path|
-        index = path[suffixer, 1].to_i
+    def renumber_images(pages, glob, format)
+      suffixer = /_chunk(\d+)_0+(\d+)\.#{format}\Z/
+      images = Dir[glob].map do |path|
+        chunk = path[suffixer, 1].to_i
+        index = chunk * CHUNK_SIZE + path[suffixer, 2].to_i
         {:path => path, :index => index, :page_number => index + 1}
       end
       numbers = page_list(pages).reverse