Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Clean pdffonts output to avoid invalid UTF-8 characters #134

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions lib/docsplit/image_extractor.rb
Original file line number Diff line number Diff line change
Expand Up @@ -33,22 +33,22 @@ def convert(pdf, size, format, previous=nil)
directory = directory_for(size)
pages = @pages || '1-' + Docsplit.extract_length(pdf).to_s
escaped_pdf = ESCAPE[pdf]
FileUtils.mkdir_p(directory) unless File.exists?(directory)
FileUtils.mkdir_p(directory) unless File.exist?(directory)
common = "#{MEMORY_ARGS} -density #{@density} #{resize_arg(size)} #{quality_arg(format)}"
if previous
FileUtils.cp(Dir[directory_for(previous) + '/*'], directory)
result = `MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1`.chomp
result = `MAGICK_TEMPORARY_PATH=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1`.chomp
raise ExtractionFailed, result if $? != 0
else
page_list(pages).each do |page|
out_file = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")]
cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1".chomp
cmd = "MAGICK_TEMPORARY_PATH=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1".chomp
result = `#{cmd}`.chomp
raise ExtractionFailed, result if $? != 0
end
end
ensure
FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
FileUtils.remove_entry_secure tempdir if File.exist?(tempdir)
end


Expand Down
4 changes: 2 additions & 2 deletions lib/docsplit/page_extractor.rb
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,15 @@ def extract(pdfs, opts)
[pdfs].flatten.each do |pdf|
pdf_name = File.basename(pdf, File.extname(pdf))
page_path = ESCAPE[File.join(@output, "#{pdf_name}")] + "_%d.pdf"
FileUtils.mkdir_p @output unless File.exists?(@output)
FileUtils.mkdir_p @output unless File.exist?(@output)

cmd = if DEPENDENCIES[:pdftailor] # prefer pdftailor, but keep pdftk for backwards compatability
"pdftailor unstitch --output #{page_path} #{ESCAPE[pdf]} 2>&1"
else
"pdftk #{ESCAPE[pdf]} burst output #{page_path} 2>&1"
end
result = `#{cmd}`.chomp
FileUtils.rm('doc_data.txt') if File.exists?('doc_data.txt')
FileUtils.rm('doc_data.txt') if File.exist?('doc_data.txt')
raise ExtractionFailed, result if $? != 0
result
end
Expand Down
8 changes: 4 additions & 4 deletions lib/docsplit/pdf_extractor.rb
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def office_executable
# raise an error if that path isn't valid, otherwise, add
# it to the front of our search paths.
if ENV['OFFICE_PATH']
raise ArgumentError, "No such file or directory #{ENV['OFFICE_PATH']}" unless File.exists? ENV['OFFICE_PATH']
raise ArgumentError, "No such file or directory #{ENV['OFFICE_PATH']}" unless File.exist? ENV['OFFICE_PATH']
paths.unshift(ENV['OFFICE_PATH'])
end

Expand All @@ -95,11 +95,11 @@ def office_executable
# Search for the first suitable office executable
# and short circuit an executable is found.
paths.each do |path|
if File.exists? path
if File.exist? path
@@executable ||= path unless File.directory? path
path_pieces.each do |pieces|
check_path = File.join(path, pieces)
@@executable ||= check_path if File.exists? check_path
@@executable ||= check_path if File.exist? check_path
end
end
break if @@executable
Expand All @@ -116,7 +116,7 @@ def office_path
# Convert documents to PDF.
def extract(docs, opts)
out = opts[:output] || '.'
FileUtils.mkdir_p out unless File.exists?(out)
FileUtils.mkdir_p out unless File.exist?(out)
[docs].flatten.each do |doc|
ext = File.extname(doc)
basename = File.basename(doc, ext)
Expand Down
14 changes: 9 additions & 5 deletions lib/docsplit/text_extractor.rb
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,16 @@ class TextExtractor

MIN_TEXT_PER_PAGE = 100 # in bytes

TIMEOUT = '5m'

def initialize
@pages_to_ocr = []
end

# Extract text from a list of PDFs.
def extract(pdfs, opts)
extract_options opts
FileUtils.mkdir_p @output unless File.exists?(@output)
FileUtils.mkdir_p @output unless File.exist?(@output)
[pdfs].flatten.each do |pdf|
@pdf_name = File.basename(pdf, File.extname(pdf))
pages = (@pages == 'all') ? 1..Docsplit.extract_length(pdf) : @pages
Expand All @@ -46,7 +48,7 @@ def extract(pdfs, opts)
# Does a PDF have any text embedded?
def contains_text?(pdf)
fonts = `pdffonts #{ESCAPE[pdf]} 2>&1`
!fonts.match(NO_TEXT_DETECTED)
!fonts.scrub.match(NO_TEXT_DETECTED)
end

# Extract a page range worth of text from a PDF, directly.
Expand All @@ -66,21 +68,23 @@ def extract_from_ocr(pdf, pages)
tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
escaped_tiff = ESCAPE[tiff]
file = "#{base_path}_#{page}"
run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1"
run "MAGICK_TEMPORARY_PATH=#{tempdir} OMP_NUM_THREADS=2 timeout #{TIMEOUT} gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1"
raise Docsplit::ExtractionFailed unless File.exist? escaped_tiff
run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} #{psm} 2>&1"
clean_text(file + '.txt') if @clean_ocr
FileUtils.remove_entry_secure tiff
end
else
tiff = "#{tempdir}/#{@pdf_name}.tif"
escaped_tiff = ESCAPE[tiff]
run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
run "MAGICK_TEMPORARY_PATH=#{tempdir} OMP_NUM_THREADS=2 timeout #{TIMEOUT} gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
#if the user says don't do orientation detection or the plugin is not installed, set psm to 0
raise Docsplit::ExtractionFailed unless File.exist? escaped_tiff
run "tesseract #{escaped_tiff} #{base_path} -l #{@language} #{psm} 2>&1"
clean_text(base_path + '.txt') if @clean_ocr
end
ensure
FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
FileUtils.remove_entry_secure tempdir if File.exist?(tempdir)
end


Expand Down