-
Notifications
You must be signed in to change notification settings - Fork 0
/
article_parser.rb
113 lines (93 loc) · 3.63 KB
/
article_parser.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
require 'nokogiri'
require 'down'
require 'fileutils'
require 'rack/mime'
require 'net/http'
require 'httparty'
# Extracts the main content node of a parsed HTML page and rewrites the
# `<img>` tags of an article so that they point at locally stored copies.
#
# The module extends itself, so every public method can be called directly
# (`ArticleParser.download(doc)`) or mixed into another object.
module ArticleParser
  extend self

  # Picks the node that most likely holds the article and strips clutter from it.
  #
  # The first match of `article`, `main`, `[id="content"]`, `body` (in that
  # order) is used. Boilerplate elements are removed by tag name, then every
  # descendant that has any attribute value containing one of the stop words
  # listed in `lib/stop_words.txt` (one per line) is removed as well.
  #
  # @param document [#at_css] parsed page, e.g. a Nokogiri document
  # @return [Object] the cleaned content node (mutated in place)
  # @raise [NoMethodError] if none of the selectors matches (the node is nil)
  def download(document)
    # TODO: Could we use the print version should a site have one? Try World Hum.
    # Find article content
    article = document.at_css('article') ||
              document.at_css('main') ||
              document.at_css('[id="content"]') ||
              document.at_css('body')

    # Remove unwanted HTML elements
    article.search('aside', 'script', 'noscript', 'style', 'nav', 'video', 'form', 'button', 'fbs-ad', 'map').remove

    # Remove unwanted elements by class/id
    # TODO: make case insensitive to shorten file length
    stop_words_file = File.join(Rails.root, 'lib', 'stop_words.txt')
    File.readlines(stop_words_file, chomp: true).each do |word|
      # `contains(., '')` is true for every attribute, so a blank line would
      # delete every element that carries an attribute.
      next if word.strip.empty?

      # The leading `.` keeps the search inside the article; an absolute `//`
      # walks the whole document and stops reaching the article once it (or an
      # ancestor) has been unlinked. Binding the word as an XPath variable
      # keeps quotes inside a stop word from breaking the expression.
      article.xpath('.//*[@*[contains(., $word)]]', nil, { word: word }).each(&:remove)
    end

    article
  end

  # Downloads every image referenced by an article and points the `<img>`
  # tags at the local copies.
  #
  # Images are numbered in document order and stored as
  # `<full_directory_path>/<file_name>/<count.to_words><extension>`; the `src`
  # is rewritten to `<file_name>/<count.to_words><extension>`. Images that have
  # no usable source, are inline SVG data URIs, are served from
  # amazon-adsystem.com, or cannot be downloaded get the placeholder image
  # instead (see #no_image_found).
  #
  # NOTE(review): `Integer#to_words` is not core Ruby; it is assumed to be
  # provided by a gem loaded elsewhere in the application.
  #
  # @param full_directory_path [String] directory that contains the article folder
  # @param file_name [String] name of the article folder (also the `src` prefix)
  # @param document [String] HTML of the article body
  # @return [Nokogiri::HTML::DocumentFragment] the fragment with rewritten images
  def images(full_directory_path, file_name, document)
    # Tell nokogiri this is not a whole document
    article = Nokogiri::HTML::DocumentFragment.parse(document)
    # Built once per call instead of once per image.
    extensions = Rack::Mime::MIME_TYPES.invert

    article.css('img').each_with_index do |img, index|
      count = index + 1
      image = fetch_image(image_source(img))

      if image
        # Rename the file so that the name never depends on the remote URL
        image_name = "#{count.to_words}#{extensions[image.content_type]}"
        temporary_path = image.path
        # Release the file handle before moving the file into place
        image.close
        FileUtils.mv(temporary_path, "#{full_directory_path}/#{file_name}/#{image_name}")
        img.set_attribute('src', "#{file_name}/#{image_name}")
      else
        no_image_found(full_directory_path, file_name, img, count)
      end
    end

    article
  end

  # Stores the "no-image" placeholder under the image's numbered file name
  # and points the `<img>` tag at it.
  #
  # @param full_directory_path [String] directory that contains the article folder
  # @param file_name [String] name of the article folder (also the `src` prefix)
  # @param img [#set_attribute] the image node to rewrite
  # @param count [Integer] position of the image within the article (1-based)
  # @return [Object] whatever `img.set_attribute` returns
  def no_image_found(full_directory_path, file_name, img, count)
    placeholder_name = "#{count.to_words}.jpg"
    # Resolve against Rails.root so the copy does not depend on the working directory
    placeholder = File.join(Rails.root, 'app', 'assets', 'images', 'no-image.jpg')
    FileUtils.cp(placeholder, "#{full_directory_path}/#{file_name}/#{placeholder_name}")
    # `set_attribute` also works when the tag has no `src` attribute yet
    img.set_attribute('src', "#{file_name}/#{placeholder_name}")
  end

  private

  # Returns the first non-blank value among `src` and the lazy-loading
  # attributes `data-src`, `data-image`, `data-img`, or nil if there is none.
  def image_source(img)
    %w[src data-src data-image data-img].map { |name| img.attr(name) }.find(&:present?)
  end

  # Downloads `source` with the Down gem. Returns the downloaded file, or nil
  # when the source is missing, is an inline SVG, is an ad, or Down reports an
  # error (invalid URL, 4xx/5xx response, connection failure, ...).
  def fetch_image(source)
    return if source.nil? || source.include?('data:image/svg+xml') || ad_image?(source)

    Down.download(source)
  rescue Down::Error
    nil
  end

  # True when the source is served from amazon-adsystem.com. Sources without
  # a host (relative paths, data URIs) or that URI cannot parse are not ads.
  def ad_image?(source)
    URI.parse(source).host.to_s.include?('amazon-adsystem.com')
  rescue URI::InvalidURIError
    false
  end
end