scrape.go (forked from andrew-d/goscrape)
package scrape

import (
	"errors"
	"fmt"

	"github.com/PuerkitoBio/goquery"
)

var (
	ErrNoPieces = errors.New("no pieces in the config")
)

// The DividePageFunc type is used to extract a page's blocks during a scrape.
// For more information, please see the documentation on the ScrapeConfig type.
type DividePageFunc func(*goquery.Selection) []*goquery.Selection
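
// Illustrative sketch of a custom DividePageFunc: it treats each element
// matching a hypothetical 'ul.results > li' selector as one block. Compare
// the package's DividePageBySelector helper, used for the default in New.
func dividePageByListItems(root *goquery.Selection) []*goquery.Selection {
	blocks := []*goquery.Selection{}
	root.Find("ul.results > li").Each(func(_ int, s *goquery.Selection) {
		blocks = append(blocks, s)
	})
	return blocks
}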

// The PieceExtractor interface represents something that can extract data from
// a selection.
type PieceExtractor interface {
	// Extract some data from the given Selection and return it. The returned
	// data should be encodable - i.e. passing it to json.Marshal should succeed.
	// If the returned data is nil, then the output from this piece will not be
	// included.
	//
	// If this function returns an error, then the scrape is aborted.
	Extract(*goquery.Selection) (interface{}, error)
}
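
// Illustrative sketch of a PieceExtractor: it returns the text of the
// selection, or nil (omitting the Piece from the block's results) when the
// sub-selector matched nothing. The type name is hypothetical.
type textExtractor struct{}

func (textExtractor) Extract(sel *goquery.Selection) (interface{}, error) {
	if sel.Length() == 0 {
		return nil, nil
	}
	return sel.Text(), nil
}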

// The Paginator interface should be implemented by things that can retrieve the
// next page from the current one.
type Paginator interface {
	// NextPage controls the progress of the scrape. It is called for each input
	// page, starting with the origin URL, and is expected to return the URL of
	// the next page to process. Note that order matters - calling 'NextPage' on
	// page 1 should return page 2, not page 3. The function should return an
	// empty string when there are no more pages to process.
	NextPage(url string, document *goquery.Selection) (string, error)

	// TODO(andrew-d): should this return a string, a url.URL, ???
}
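
// Illustrative Paginator sketch: follow the href of a hypothetical 'a.next'
// link on the current page, returning "" when no such link is present. A real
// implementation may need to resolve relative hrefs against the current URL.
type nextLinkPaginator struct{}

func (nextLinkPaginator) NextPage(url string, doc *goquery.Selection) (string, error) {
	if href, ok := doc.Find("a.next").Attr("href"); ok {
		return href, nil
	}
	return "", nil
}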

// A Piece represents a given chunk of data that is to be extracted from every
// block in each page of a scrape.
type Piece struct {
	// The name of this piece. Required, and will be used to aggregate results.
	Name string

	// A sub-selector within the given block to process. Pass in "." to use
	// the root block's selector with no modification.
	Selector string

	// TODO(andrew-d): Consider making this an interface too.
	// Extractor contains the logic on how to extract some results from the
	// selector that is provided to this Piece.
	Extractor PieceExtractor
}

// The main configuration for a scrape. Pass this to the New() function.
type ScrapeConfig struct {
	// Fetcher is the underlying transport that is used to fetch documents.
	// If this is not specified (i.e. left nil), then a default HttpClientFetcher
	// will be created and used.
	Fetcher Fetcher

	// Paginator is the Paginator to use for this current scrape.
	//
	// If Paginator is nil, then no pagination is performed and it is assumed that
	// the initial URL is the only page.
	Paginator Paginator

	// DividePage splits a page into individual 'blocks'. When scraping, we treat
	// each page as if it contains some number of 'blocks', each of which can be
	// further subdivided into what actually needs to be extracted.
	//
	// If the DividePage function is nil, then no division is performed and the
	// page is assumed to contain a single block containing the entire <body>
	// element.
	DividePage DividePageFunc

	// Pieces contains the list of data that is extracted for each block. For
	// every block that is the result of the DividePage function (above), each of
	// the Pieces entries receives the selector representing the block and can
	// return a result. If the returned result is nil, then the Piece is
	// considered not to exist in this block, and is not included.
	//
	// Note: if a Piece's Extractor returns an error, it results in the scrape
	// being aborted - this can be useful if you need to ensure that a given Piece
	// is required, for example.
	Pieces []Piece
}
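
// Illustrative sketch of a configuration: Fetcher and Paginator are left nil,
// so the defaults described above apply; the selectors are hypothetical, and
// textExtractor is the sketch extractor defined earlier in this file. The
// "." selector shows how to extract from the block's root element itself.
var exampleConfig = &ScrapeConfig{
	DividePage: DividePageBySelector("li.item"),
	Pieces: []Piece{
		{Name: "name", Selector: "span.name", Extractor: textExtractor{}},
		{Name: "whole", Selector: ".", Extractor: textExtractor{}},
	},
}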

func (c *ScrapeConfig) clone() *ScrapeConfig {
	ret := &ScrapeConfig{
		Fetcher:    c.Fetcher,
		Paginator:  c.Paginator,
		DividePage: c.DividePage,
		Pieces:     c.Pieces,
	}
	return ret
}

// ScrapeResults describes the results of a scrape. It contains a list of all
// pages (URLs) visited during the process, along with all results generated
// from each Piece in each page.
type ScrapeResults struct {
	// All URLs visited during this scrape, in order. Always contains at least
	// one element - the initial URL.
	URLs []string

	// The results from each Piece of each page. Essentially, the top-level array
	// is for each page, the second-level array is for each block in a page, and
	// the final map[string]interface{} is the mapping of Piece.Name to results.
	Results [][]map[string]interface{}
}

// First returns the first set of results - i.e. the results from the first
// block on the first page.
//
// This function can return nil if there were no blocks found on the first page
// of the scrape.
func (r *ScrapeResults) First() map[string]interface{} {
	if len(r.Results[0]) == 0 {
		return nil
	}
	return r.Results[0][0]
}

// AllBlocks returns a single list of results from every block on all pages.
// This function will always return a list, even if no blocks were found.
func (r *ScrapeResults) AllBlocks() []map[string]interface{} {
	ret := []map[string]interface{}{}
	for _, page := range r.Results {
		for _, block := range page {
			ret = append(ret, block)
		}
	}
	return ret
}
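
// Illustrative sketch: gather every value recorded under a given Piece name
// across all pages and blocks, in page order, following the
// page -> block -> Piece.Name layout of Results described above.
func (r *ScrapeResults) valuesFor(name string) []interface{} {
	vals := []interface{}{}
	for _, page := range r.Results {
		for _, block := range page {
			if v, ok := block[name]; ok {
				vals = append(vals, v)
			}
		}
	}
	return vals
}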

type Scraper struct {
	config *ScrapeConfig
}

// Create a new scraper with the provided configuration.
func New(c *ScrapeConfig) (*Scraper, error) {
	var err error

	// Validate config
	if len(c.Pieces) == 0 {
		return nil, ErrNoPieces
	}

	seenNames := map[string]struct{}{}
	for i, piece := range c.Pieces {
		if len(piece.Name) == 0 {
			return nil, fmt.Errorf("no name provided for piece %d", i)
		}
		if _, seen := seenNames[piece.Name]; seen {
			return nil, fmt.Errorf("piece %d has a duplicate name", i)
		}
		seenNames[piece.Name] = struct{}{}

		if len(piece.Selector) == 0 {
			return nil, fmt.Errorf("no selector provided for piece %d", i)
		}
	}

	// Clone the configuration and fill in the defaults.
	config := c.clone()
	if config.Paginator == nil {
		config.Paginator = dummyPaginator{}
	}
	if config.DividePage == nil {
		config.DividePage = DividePageBySelector("body")
	}
	if config.Fetcher == nil {
		config.Fetcher, err = NewHttpClientFetcher()
		if err != nil {
			return nil, err
		}
	}

	// All set!
	ret := &Scraper{
		config: config,
	}
	return ret, nil
}

// Scrape a given URL with default options. See 'ScrapeWithOpts' for more
// information.
func (s *Scraper) Scrape(url string) (*ScrapeResults, error) {
	return s.ScrapeWithOpts(url, DefaultOptions)
}

// Actually start scraping at the given URL.
//
// Note that, while this function and the Scraper in general are safe for use
// from multiple goroutines, making multiple requests in parallel can cause
// strange behaviour - e.g. overwriting cookies in the underlying http.Client.
// Please be careful when running multiple scrapes at a time, unless you know
// that it's safe.
func (s *Scraper) ScrapeWithOpts(url string, opts ScrapeOptions) (*ScrapeResults, error) {
	if len(url) == 0 {
		return nil, errors.New("no URL provided")
	}

	// Prepare the fetcher.
	err := s.config.Fetcher.Prepare()
	if err != nil {
		return nil, err
	}

	res := &ScrapeResults{
		URLs:    []string{},
		Results: [][]map[string]interface{}{},
	}

	var numPages int
	for {
		// Repeat until we don't have any more URLs, or until we hit our page limit.
		if len(url) == 0 || (opts.MaxPages > 0 && numPages >= opts.MaxPages) {
			break
		}

		resp, err := s.config.Fetcher.Fetch("GET", url)
		if err != nil {
			return nil, err
		}

		// Create a goquery document.
		doc, err := goquery.NewDocumentFromReader(resp)
		resp.Close()
		if err != nil {
			return nil, err
		}

		res.URLs = append(res.URLs, url)
		results := []map[string]interface{}{}

		// Divide this page into blocks.
		for _, block := range s.config.DividePage(doc.Selection) {
			blockResults := map[string]interface{}{}

			// Process each piece of this block.
			for _, piece := range s.config.Pieces {
				sel := block
				if piece.Selector != "." {
					sel = sel.Find(piece.Selector)
				}

				pieceResults, err := piece.Extractor.Extract(sel)
				if err != nil {
					return nil, err
				}

				// A nil response from an extractor means that we don't even include it in
				// the results.
				if pieceResults == nil {
					continue
				}

				blockResults[piece.Name] = pieceResults
			}

			// Append the results from this block.
			results = append(results, blockResults)
		}

		// Append the results from this page.
		res.Results = append(res.Results, results)
		numPages++

		// Get the next page.
		url, err = s.config.Paginator.NextPage(url, doc.Selection)
		if err != nil {
			return nil, err
		}
	}

	// All good!
	return res, nil
}
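
// Illustrative end-to-end sketch (the URL and selectors are hypothetical, and
// textExtractor is the sketch extractor defined earlier): build a config,
// create a Scraper, run a scrape, and walk the nested results.
func exampleScrape() error {
	scraper, err := New(&ScrapeConfig{
		DividePage: DividePageBySelector("div.result"),
		Pieces: []Piece{
			{Name: "title", Selector: "h2", Extractor: textExtractor{}},
		},
	})
	if err != nil {
		return err
	}

	res, err := scraper.Scrape("http://example.com/results")
	if err != nil {
		return err
	}

	// Results are indexed as page -> block -> Piece.Name.
	for _, page := range res.Results {
		for _, block := range page {
			fmt.Println(block["title"])
		}
	}
	return nil
}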