documentcloud · tbk303 · Sep 22, 2015 · Feb 6, 2019 · Feb 6, 2019 · Mar 3, 2021
diff --git a/lib/docsplit/image_extractor.rb b/lib/docsplit/image_extractor.rb
@@ -33,22 +33,22 @@ def convert(pdf, size, format, previous=nil)
       directory = directory_for(size)
       pages     = @pages || '1-' + Docsplit.extract_length(pdf).to_s
       escaped_pdf = ESCAPE[pdf]
-      FileUtils.mkdir_p(directory) unless File.exists?(directory)
+      FileUtils.mkdir_p(directory) unless File.exist?(directory)
       common    = "#{MEMORY_ARGS} -density #{@density} #{resize_arg(size)} #{quality_arg(format)}"
       if previous
         FileUtils.cp(Dir[directory_for(previous) + '/*'], directory)
-        result = `MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1`.chomp
+        result = `MAGICK_TEMPORARY_PATH=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1`.chomp
         raise ExtractionFailed, result if $? != 0
       else
         page_list(pages).each do |page|
           out_file  = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")]
-          cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1".chomp
+          cmd = "MAGICK_TEMPORARY_PATH=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1".chomp
           result = `#{cmd}`.chomp
           raise ExtractionFailed, result if $? != 0
         end
       end
     ensure
-      FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
+      FileUtils.remove_entry_secure tempdir if File.exist?(tempdir)
     end
 
 

diff --git a/lib/docsplit/page_extractor.rb b/lib/docsplit/page_extractor.rb
@@ -10,15 +10,15 @@ def extract(pdfs, opts)
       [pdfs].flatten.each do |pdf|
         pdf_name = File.basename(pdf, File.extname(pdf))
         page_path = ESCAPE[File.join(@output, "#{pdf_name}")] + "_%d.pdf"
-        FileUtils.mkdir_p @output unless File.exists?(@output)
+        FileUtils.mkdir_p @output unless File.exist?(@output)
 
         cmd = if DEPENDENCIES[:pdftailor] # prefer pdftailor, but keep pdftk for backwards compatability
           "pdftailor unstitch --output #{page_path} #{ESCAPE[pdf]} 2>&1"
         else
           "pdftk #{ESCAPE[pdf]} burst output #{page_path} 2>&1"
         end
         result = `#{cmd}`.chomp
-        FileUtils.rm('doc_data.txt') if File.exists?('doc_data.txt')
+        FileUtils.rm('doc_data.txt') if File.exist?('doc_data.txt')
         raise ExtractionFailed, result if $? != 0
         result
       end

diff --git a/lib/docsplit/pdf_extractor.rb b/lib/docsplit/pdf_extractor.rb
@@ -78,7 +78,7 @@ def office_executable
       # raise an error if that path isn't valid, otherwise, add
       # it to the front of our search paths.
       if ENV['OFFICE_PATH']
-        raise ArgumentError, "No such file or directory #{ENV['OFFICE_PATH']}" unless File.exists? ENV['OFFICE_PATH']
+        raise ArgumentError, "No such file or directory #{ENV['OFFICE_PATH']}" unless File.exist? ENV['OFFICE_PATH']
         paths.unshift(ENV['OFFICE_PATH'])
       end
 
@@ -95,11 +95,11 @@ def office_executable
       # Search for the first suitable office executable
       # and short circuit an executable is found.
       paths.each do |path|
-        if File.exists? path
+        if File.exist? path
           @@executable ||= path unless File.directory? path
           path_pieces.each do |pieces|
             check_path = File.join(path, pieces)
-            @@executable ||= check_path if File.exists? check_path
+            @@executable ||= check_path if File.exist? check_path
           end
         end
         break if @@executable
@@ -116,7 +116,7 @@ def office_path
     # Convert documents to PDF.
     def extract(docs, opts)
       out = opts[:output] || '.'
-      FileUtils.mkdir_p out unless File.exists?(out)
+      FileUtils.mkdir_p out unless File.exist?(out)
       [docs].flatten.each do |doc|
         ext = File.extname(doc)
         basename = File.basename(doc, ext)

diff --git a/lib/docsplit/text_extractor.rb b/lib/docsplit/text_extractor.rb
@@ -21,14 +21,16 @@ class TextExtractor
 
     MIN_TEXT_PER_PAGE = 100 # in bytes
 
+    TIMEOUT = '5m'
+
     def initialize
       @pages_to_ocr = []
     end
 
     # Extract text from a list of PDFs.
     def extract(pdfs, opts)
       extract_options opts
-      FileUtils.mkdir_p @output unless File.exists?(@output)
+      FileUtils.mkdir_p @output unless File.exist?(@output)
       [pdfs].flatten.each do |pdf|
         @pdf_name = File.basename(pdf, File.extname(pdf))
         pages = (@pages == 'all') ? 1..Docsplit.extract_length(pdf) : @pages
@@ -46,7 +48,7 @@ def extract(pdfs, opts)
     # Does a PDF have any text embedded?
     def contains_text?(pdf)
       fonts = `pdffonts #{ESCAPE[pdf]} 2>&1`
-      !fonts.match(NO_TEXT_DETECTED)
+      !fonts.scrub.match(NO_TEXT_DETECTED)
     end
 
     # Extract a page range worth of text from a PDF, directly.
@@ -66,21 +68,23 @@ def extract_from_ocr(pdf, pages)
           tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
           escaped_tiff = ESCAPE[tiff]
           file = "#{base_path}_#{page}"
-          run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1"
+          run "MAGICK_TEMPORARY_PATH=#{tempdir} OMP_NUM_THREADS=2 timeout #{TIMEOUT} gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1"
+          raise Docsplit::ExtractionFailed unless File.exist? escaped_tiff
           run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} #{psm} 2>&1"
           clean_text(file + '.txt') if @clean_ocr
           FileUtils.remove_entry_secure tiff
         end
       else
         tiff = "#{tempdir}/#{@pdf_name}.tif"
         escaped_tiff = ESCAPE[tiff]
-        run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
+        run "MAGICK_TEMPORARY_PATH=#{tempdir} OMP_NUM_THREADS=2 timeout #{TIMEOUT} gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
         #if the user says don't do orientation detection or the plugin is not installed, set psm to 0
+        raise Docsplit::ExtractionFailed unless File.exist? escaped_tiff
         run "tesseract #{escaped_tiff} #{base_path} -l #{@language} #{psm} 2>&1"
         clean_text(base_path + '.txt') if @clean_ocr
       end
     ensure
-      FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
+      FileUtils.remove_entry_secure tempdir if File.exist?(tempdir)
     end