documentcloud · knowtheory · Aug 31, 2024 · May 5, 2023 · May 5, 2023
diff --git a/lib/docsplit/image_extractor.rb b/lib/docsplit/image_extractor.rb
@@ -33,7 +33,7 @@ def convert(pdf, size, format, previous=nil)
       directory = directory_for(size)
       pages     = @pages || '1-' + Docsplit.extract_length(pdf).to_s
       escaped_pdf = ESCAPE[pdf]
-      FileUtils.mkdir_p(directory) unless File.exists?(directory)
+      FileUtils.mkdir_p(directory) unless File.exist?(directory)
       common    = "#{MEMORY_ARGS} -density #{@density} #{resize_arg(size)} #{quality_arg(format)}"
       if previous
         FileUtils.cp(Dir[directory_for(previous) + '/*'], directory)
@@ -48,7 +48,7 @@ def convert(pdf, size, format, previous=nil)
         end
       end
     ensure
-      FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
+      FileUtils.remove_entry_secure tempdir if File.exist?(tempdir)
     end
 
 

diff --git a/lib/docsplit/page_extractor.rb b/lib/docsplit/page_extractor.rb
@@ -10,15 +10,15 @@ def extract(pdfs, opts)
       [pdfs].flatten.each do |pdf|
         pdf_name = File.basename(pdf, File.extname(pdf))
         page_path = ESCAPE[File.join(@output, "#{pdf_name}")] + "_%d.pdf"
-        FileUtils.mkdir_p @output unless File.exists?(@output)
-        
+        FileUtils.mkdir_p @output unless File.exist?(@output)
+
         cmd = if DEPENDENCIES[:pdftailor] # prefer pdftailor, but keep pdftk for backwards compatability
           "pdftailor unstitch --output #{page_path} #{ESCAPE[pdf]} 2>&1"
         else
           "pdftk #{ESCAPE[pdf]} burst output #{page_path} 2>&1"
         end
         result = `#{cmd}`.chomp
-        FileUtils.rm('doc_data.txt') if File.exists?('doc_data.txt')
+        FileUtils.rm('doc_data.txt') if File.exist?('doc_data.txt')
         raise ExtractionFailed, result if $? != 0
         result
       end

diff --git a/lib/docsplit/pdf_extractor.rb b/lib/docsplit/pdf_extractor.rb
@@ -16,7 +16,7 @@ def osx?
     def linux?
       !!HOST_OS.match(/linux/i)
     end
-    
+
     # The first line of the help output holds the name and version number
     # of the office software to be used for extraction.
     def version_string
@@ -35,10 +35,10 @@ def libre_office?
     def open_office?
       !!version_string.match(/^OpenOffice.org/)
     end
-    
+
     # A set of default locations to search for office software
     # These have been extracted from JODConverter.  Each listed
-    # path should contain a directory "program" which in turn 
+    # path should contain a directory "program" which in turn
     # contains the "soffice" executable.
     # see: https://github.com/mirkonasato/jodconverter/blob/master/jodconverter-core/src/main/java/org/artofsolving/jodconverter/office/OfficeUtils.java#L63-L91
     def office_search_paths
@@ -69,7 +69,7 @@ def office_search_paths
       end
       search_paths
     end
-    
+
     # Identify the path to a working office executable.
     def office_executable
       paths = office_search_paths
@@ -78,10 +78,10 @@ def office_executable
       # raise an error if that path isn't valid, otherwise, add
       # it to the front of our search paths.
       if ENV['OFFICE_PATH']
-        raise ArgumentError, "No such file or directory #{ENV['OFFICE_PATH']}" unless File.exists? ENV['OFFICE_PATH']
+        raise ArgumentError, "No such file or directory #{ENV['OFFICE_PATH']}" unless File.exist? ENV['OFFICE_PATH']
         paths.unshift(ENV['OFFICE_PATH'])
       end
-      
+
       # The location of the office executable is OS dependent
       path_pieces = ["soffice"]
       if windows?
@@ -91,32 +91,32 @@ def office_executable
       else
         path_pieces += [["program", "soffice"]]
       end
-      
+
       # Search for the first suitable office executable
       # and short circuit an executable is found.
       paths.each do |path|
-        if File.exists? path
+        if File.exist? path
           @@executable ||= path unless File.directory? path
           path_pieces.each do |pieces|
             check_path = File.join(path, pieces)
-            @@executable ||= check_path if File.exists? check_path
+            @@executable ||= check_path if File.exist? check_path
           end
         end
         break if @@executable
       end
       raise OfficeNotFound, "No office software found" unless @@executable
       @@executable
     end
-    
+
     # Used to specify the office location for JODConverter
     def office_path
       File.dirname(File.dirname(office_executable))
     end
-    
+
     # Convert documents to PDF.
     def extract(docs, opts)
       out = opts[:output] || '.'
-      FileUtils.mkdir_p out unless File.exists?(out)
+      FileUtils.mkdir_p out unless File.exist?(out)
       [docs].flatten.each do |doc|
         ext = File.extname(doc)
         basename = File.basename(doc, ext)
@@ -128,7 +128,7 @@ def extract(docs, opts)
           if libre_office?
             # Set the LibreOffice user profile, so that parallel uses of cloudcrowd don't trip over each other.
             ENV['SYSUSERCONFIG']="file://#{File.expand_path(escaped_out)}"
-            
+
             options = "--headless --invisible  --norestore --nolockcheck --convert-to pdf --outdir #{escaped_out} #{escaped_doc}"
             cmd = "#{office_executable} #{options} 2>&1"
             result = `#{cmd}`.chomp
@@ -147,9 +147,9 @@ def extract(docs, opts)
     LOGGING       = "-Djava.util.logging.config.file=#{ESCAPED_ROOT}/vendor/logging.properties"
 
     HEADLESS      = "-Djava.awt.headless=true"
-    
+
     private
-    
+
     # Runs a Java command, with quieted logging, and the classpath set properly.
     def run_jod(command, pdfs, opts, return_output=false)
 

diff --git a/lib/docsplit/text_extractor.rb b/lib/docsplit/text_extractor.rb
@@ -28,7 +28,7 @@ def initialize
     # Extract text from a list of PDFs.
     def extract(pdfs, opts)
       extract_options opts
-      FileUtils.mkdir_p @output unless File.exists?(@output)
+      FileUtils.mkdir_p @output unless File.exist?(@output)
       [pdfs].flatten.each do |pdf|
         @pdf_name = File.basename(pdf, File.extname(pdf))
         pages = (@pages == 'all') ? 1..Docsplit.extract_length(pdf) : @pages
@@ -80,7 +80,7 @@ def extract_from_ocr(pdf, pages)
         clean_text(base_path + '.txt') if @clean_ocr
       end
     ensure
-      FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
+      FileUtils.remove_entry_secure tempdir if File.exist?(tempdir)
     end
 
 

diff --git a/test/test_helper.rb b/test/test_helper.rb
@@ -10,7 +10,7 @@ class Minitest::Test
   OUTPUT = 'test/output'
 
   def clear_output
-    FileUtils.rm_r(OUTPUT) if File.exists?(OUTPUT)
+    FileUtils.rm_r(OUTPUT) if File.exist?(OUTPUT)
   end
 
   def teardown