From 29787e0912c63f0b092745026b8ae5b5359d5cad Mon Sep 17 00:00:00 2001
From: loyalsoldier <10487845+Loyalsoldier@users.noreply.github.com>
Date: Sun, 11 Apr 2021 21:27:41 +0800
Subject: [PATCH] Feat: refactor

---
 common.go        | 151 ----------------
 crawler/crawl.go |  31 ++--
 go.mod           |   3 +-
 go.sum           |   8 +-
 main.go          | 139 ++++-----------
 parser/parse.go  |  40 -----
 program.go       | 453 +++++++++++++++++++++++++++++++++++++++++++++++
 tree.go          |   4 +-
 utils/cpu.go     |  20 ---
 utils/error.go   |  11 +-
 10 files changed, 516 insertions(+), 344 deletions(-)
 delete mode 100644 parser/parse.go
 create mode 100644 program.go
 delete mode 100644 utils/cpu.go

diff --git a/common.go b/common.go
index 70cb580..86e0fd2 100644
--- a/common.go
+++ b/common.go
@@ -1,163 +1,12 @@
 package main

 import (
-    "bufio"
-    "compress/gzip"
-    "fmt"
     "log"
-    "os"
-    "regexp"
-    "sort"
-    "strconv"
     "strings"
-    "sync"

-    "github.com/Loyalsoldier/cn-blocked-domain/crawler"
-    "github.com/Loyalsoldier/cn-blocked-domain/parser"
     "github.com/Loyalsoldier/cn-blocked-domain/utils"
-    "github.com/PuerkitoBio/goquery"
-    "github.com/matryer/try"
 )

-// GetMaxPage gets the max page of crawl type
-func GetMaxPage(initURLSlice map[*CrawlType]string, initElem, initHrefElem string) {
-    for crawlType, initURL := range initURLSlice {
-        ungzipData, err := crawler.Crawl(initURL, "https://zh.greatfire.org")
-        if err != nil {
-            log.Fatal(err)
-        }
-        defer ungzipData.Close()
-
-        // Load the HTML document
-        doc, err := goquery.NewDocumentFromReader(ungzipData)
-
-        // Find items
-        doc.Find(initElem).Each(func(i int, s *goquery.Selection) {
-            // For each item found, get contents
-            lastPageHref, exists := s.Find(initHrefElem).Attr("href")
-            if exists {
-                matchList := strings.Split(lastPageHref, "?page=")
-                if len(matchList) > 0 {
-                    maxPage := matchList[1]
-                    crawlType.GreatFireURL.MaxPage, _ = strconv.Atoi(maxPage)
-                    log.Printf("%s has %s pages\n", initURL, maxPage)
-                }
-            } else {
-                log.Printf("Failed to get the max page of %s\n", initURL)
-            }
-        })
-    }
-}
-
-// ControlFlow controls the crawl process
-func ControlFlow(crawlItems []string, outChan chan map[string]int, elem, uElem, bElem string, retryTimes, numCPUs int) {
-    var wg sync.WaitGroup
-    maxGoRoutinesChan := make(chan struct{}, numCPUs)
-
-    for _, url := range crawlItems {
-        // Decrement the remaining space for max GoRoutines parallelism
-        maxGoRoutinesChan <- struct{}{}
-        // Increment the WaitGroup counter
-        wg.Add(1)
-        go CrawlAndProcessPage(url, outChan, &wg, maxGoRoutinesChan, elem, uElem, bElem, retryTimes)
-    }
-
-    // Wait for all goroutines to complete
-    wg.Wait()
-
-    close(outChan)
-}
-
-// CrawlAndProcessPage crawls a URL page and processes it
-func CrawlAndProcessPage(url string, outChan chan map[string]int, wg *sync.WaitGroup, maxGoRoutinesChan chan struct{}, elem, uElem, bElem string, retryTimes int) {
-    defer func() {
-        if err := recover(); err != nil {
-            log.Printf("Goroutine panic: fetching %v : %v\n", url, err)
-        }
-    }()
-
-    var ungzipData *gzip.Reader
-    err := try.Do(func(attempt int) (retry bool, err error) {
-        retry = attempt < retryTimes
-        defer func() {
-            if r := recover(); r != nil {
-                err = fmt.Errorf("panic: %v", r)
-            }
-        }()
-
-        if attempt > 1 {
-            log.Println(utils.Fatal(attempt), "time, crawling URL:", utils.Info(url))
-        } else {
-            log.Println(utils.Warning(attempt), "time, crawling URL:", utils.Info(url))
-        }
-
-        ungzipData, err = crawler.Crawl(url, "https://zh.greatfire.org")
-        utils.CheckError(err)
-        return
-    })
-    utils.CheckError(err)
-    defer ungzipData.Close()
-
-    parser.HTMLParser(outChan, ungzipData, elem, uElem, bElem)
-
-    // Decrement the counter when the goroutine completes
-    defer wg.Done()
-    // Indicate that there is one free space in goroutine list
-    <-maxGoRoutinesChan
-}
-
-// ValidateAndWrite filters urlMap from resultChan and writes it to files
-func ValidateAndWrite(resultChan chan map[string]int, filteredFile, rawFile, re, reForIP string, percentStd int) {
-    defer func() {
-        if err := recover(); err != nil {
-            log.Printf("Runtime panic: %v\n", err)
-        }
-    }()
-
-    f, err := os.OpenFile(rawFile, os.O_WRONLY|os.O_CREATE, 0644)
-    utils.CheckError(err)
-    defer f.Close()
-
-    g, err := os.OpenFile(filteredFile, os.O_WRONLY|os.O_CREATE, 0644)
-    utils.CheckError(err)
-    defer g.Close()
-
-    var resultMap Results = make(map[string]struct{})
-    for result := range resultChan {
-        for url, percent := range result {
-            url = strings.ToLower(url)
-            // Write raw result to raw.txt file
-            w := bufio.NewWriter(f)
-            w.WriteString(fmt.Sprintf("%s | %d\n", url, percent))
-            w.Flush()
-
-            if percent >= percentStd {
-                var domain string
-                reg := regexp.MustCompile(re)
-                matchList := reg.FindStringSubmatch(url)
-
-                if len(matchList) > 0 {
-                    domain = matchList[len(matchList)-2]
-                    // Write filtered result to console
-                    fmt.Printf("%s | %d\n", domain, percent)
-                    // Write filtered result to Results type variable resultMap
-                    resultMap[domain] = struct{}{}
-                }
-            }
-        }
-    }
-
-    resultSlice := resultMap.SortAndUnique(reForIP)
-    sort.Strings(resultSlice)
-
-    x := bufio.NewWriter(g)
-    for _, domain := range resultSlice {
-        // Write filtered result to temp-domains.txt file
-        x.WriteString(domain + "\n")
-    }
-    x.Flush()
-}
-
 func buildTreeAndUnique(sortedDomainList []string) []string {
     tree := newList()
     remainList := make([]string, 0, len(sortedDomainList))
diff --git a/crawler/crawl.go b/crawler/crawl.go
index 0c3a1d4..3d93cb1 100644
--- a/crawler/crawl.go
+++ b/crawler/crawl.go
@@ -1,45 +1,50 @@
 package crawler

 import (
-    "compress/gzip"
+    "errors"
     "net/http"
+    "net/url"
     "runtime"
+    "strconv"
 )

 func genUA() (userAgent string) {
     switch runtime.GOOS {
     case "linux":
-        userAgent = `Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:76.0) Gecko/20100101 Firefox/76.0`
+        userAgent = `Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:87.0) Gecko/20100101 Firefox/87.0`
     case "darwin":
         userAgent = `Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36`
     case "windows":
-        userAgent = `Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0`
+        userAgent = `Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:87.0) Gecko/20100101 Firefox/87.0`
     }
     return
 }

-// Crawl crawls webpage content and returns *gzip.Reader
-func Crawl(url, ref string) (*gzip.Reader, error) {
-    req, err := http.NewRequest("GET", url, nil)
+// Crawl crawls the webpage and returns its *http.Response
+func Crawl(target, referer string) (*http.Response, error) {
+    if _, err := url.Parse(target); err != nil {
+        return nil, err
+    }
+    req, err := http.NewRequest(http.MethodGet, target, nil)
     if err != nil {
         return nil, err
     }

-    var ua = genUA()
-    req.Header.Set("User-Agent", ua)
-    req.Header.Set("Referer", ref)
+    req.Header.Set("User-Agent", genUA())
+    req.Header.Set("Referer", referer)
     req.Header.Set("Accept-Encoding", "gzip")
     req.Header.Set("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8")
     req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9")

-    res, err := http.DefaultClient.Do(req)
+    var client http.Client
+    resp, err := client.Do(req)
     if err != nil {
         return nil, err
     }
-    ungzipData, err := gzip.NewReader(res.Body)
-    if err != nil {
-        return nil, err
+    if resp.StatusCode != http.StatusOK {
+        return nil, errors.New("bad status code: " + strconv.Itoa(resp.StatusCode))
     }
-    return ungzipData, nil
+
+    return resp, nil
 }
diff --git a/go.mod b/go.mod
index 9201c07..92a1ddc 100644
--- a/go.mod
+++ b/go.mod
@@ -4,6 +4,5 @@ go 1.16

 require (
     github.com/PuerkitoBio/goquery v1.6.1
-    github.com/cheekybits/is v0.0.0-20150225183255-68e9c0620927 // indirect
-    github.com/matryer/try v0.0.0-20161228173917-9ac251b645a2
+    gopkg.in/yaml.v2 v2.4.0
 )
diff --git a/go.sum b/go.sum
index a78cb0e..90416d3 100644
--- a/go.sum
+++ b/go.sum
@@ -2,13 +2,13 @@ github.com/PuerkitoBio/goquery v1.6.1 h1:FgjbQZKl5HTmcn4sKBgvx8vv63nhyhIpv7lJpFG
 github.com/PuerkitoBio/goquery v1.6.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc=
 github.com/andybalholm/cascadia v1.1.0 h1:BuuO6sSfQNFRu1LppgbD25Hr2vLYW25JvxHs5zzsLTo=
 github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
-github.com/cheekybits/is v0.0.0-20150225183255-68e9c0620927 h1:SKI1/fuSdodxmNNyVBR8d7X/HuLnRpvvFO0AgyQk764=
-github.com/cheekybits/is v0.0.0-20150225183255-68e9c0620927/go.mod h1:h/aW8ynjgkuj+NQRlZcDbAbM1ORAbXjXX77sX7T289U=
-github.com/matryer/try v0.0.0-20161228173917-9ac251b645a2 h1:JAEbJn3j/FrhdWA9jW8B5ajsLIjeuEHLi8xE4fk997o=
-github.com/matryer/try v0.0.0-20161228173917-9ac251b645a2/go.mod h1:0KeJpeMD6o+O4hW7qJOT7vyQPKrWmj26uf5wMc/IiIs=
 golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
 golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
 golang.org/x/net v0.0.0-20200202094626-16171245cfb2 h1:CCH4IOTTfewWjGOlSp+zGcjutRKlBEZQ6wTn8ozI/nI=
 golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
 golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
 golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
+gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY=
+gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ=
diff --git a/main.go b/main.go
index f0aaa2a..f1956a5 100644
--- a/main.go
+++ b/main.go
@@ -1,127 +1,48 @@
 package main

 import (
-    "fmt"
-    "regexp"
-    "sort"
-    "strconv"
-    "strings"
-    "sync"
-
-    "github.com/Loyalsoldier/cn-blocked-domain/utils"
-)
-
-const (
-    initElem     = "ul.pager"
-    initHrefElem = ".pager-last.last a"
-    elem         = "table.gf-header tbody tr"
-    uElem        = "td.first a"
-    bElem        = "td.blocked"
-    re           = `^\/(https?\/)?([a-zA-Z0-9][-_a-zA-Z0-9]{0,62}(\.[a-zA-Z0-9][-_a-zA-Z0-9]{0,62})+)$`
-    reForIP      = `(([0-9]{1,3}\.){3}[0-9]{1,3})`
-    rawFile      = "raw.txt"
-    filteredFile = "domains.txt"
-    percentStd   = 50       // set the min percent to filter domains
-    retryTimes   = 3        // set crawler max retry times
-    maxCap       = 100 * 16 // set the capacity of channel to contain results
+    "flag"
+    "log"
+    "os"
 )

-// Done implies whether the URL has been crawled or not
-type Done bool
+var configFile = flag.String("c", "config.yaml", "Path to the configuration file, supports YAML and JSON.")

-// GreatFireURL defines the structure of the format of URL
-type GreatFireURL struct {
-    BaseURL   string
-    MiddleURL string
-    SuffixURL string
-    MaxPage   int
+func init() {
+    flag.Parse()
 }

-// CrawlType defines the structure of AlexaTop1000 type of URLs and list
-type CrawlType struct {
-    GreatFireURL *GreatFireURL
-    URLList      []string
-    mux          sync.RWMutex
-}
+func main() {
+    rawConfig := new(RawConfig)
+    config := new(Config)

-// NewURLList returns the URL list to be crawled
-func (c *CrawlType) NewURLList() {
-    c.mux.Lock()
-    c.URLList = make([]string, 0)
-    for i := 0; i < c.GreatFireURL.MaxPage; i++ {
-        fullURL := c.GreatFireURL.BaseURL + c.GreatFireURL.MiddleURL + c.GreatFireURL.SuffixURL + strconv.Itoa(i)
-        c.URLList = append(c.URLList, fullURL)
+    if err := rawConfig.ParseRawConfig(*configFile); err != nil {
+        log.Println(err)
+        os.Exit(1)
     }
-    defer c.mux.Unlock()
-}
-
-// Results defines the structure of domain result map
-type Results map[string]struct{}

-// SortAndUnique filters the Results slice
-func (r Results) SortAndUnique(reForIP string) []string {
-    resultSlice := make([]string, 0, len(r))
-    reg := regexp.MustCompile(reForIP)
-    for domainKey := range r {
-        if len(reg.FindStringSubmatch(domainKey)) > 0 {
-            continue
-        }
-        resultSlice = append(resultSlice, domainKey)
+    if err := config.GenerateConfig(rawConfig); err != nil {
+        log.Println(err)
+        os.Exit(2)
     }
-    sort.SliceStable(resultSlice, func(i, j int) bool {
-        return len(strings.Split(resultSlice[i], ".")) < len(strings.Split(resultSlice[j], "."))
-    })
-    return buildTreeAndUnique(resultSlice)
-}
-
-func main() {
-    orginalCPUs, numCPUs := utils.SetGOMAXPROCS()
-
-    fmt.Println("CPU cores: ", utils.Info(orginalCPUs))
-    fmt.Println("Go Processors: ", utils.Info(numCPUs))
-
-    alexaTop1000 := &CrawlType{
-        GreatFireURL: &GreatFireURL{
-            BaseURL:   "https://zh.greatfire.org/search/",
-            MiddleURL: "alexa-top-1000-domains",
-            SuffixURL: "?page="}}
-
-    blocked := &CrawlType{
-        GreatFireURL: &GreatFireURL{
-            BaseURL:   "https://zh.greatfire.org/search/",
-            MiddleURL: "blocked",
-            SuffixURL: "?page="}}
-
-    domains := &CrawlType{
-        GreatFireURL: &GreatFireURL{
-            BaseURL:   "https://zh.greatfire.org/search/",
-            MiddleURL: "domains",
-            SuffixURL: "?page="}}
-
-    initURLSlice := make(map[*CrawlType]string)
-    initURLSlice[alexaTop1000] = "https://zh.greatfire.org/search/alexa-top-1000-domains?page=0"
-    initURLSlice[blocked] = "https://zh.greatfire.org/search/blocked?page=0"
-    initURLSlice[domains] = "https://zh.greatfire.org/search/domains?page=0"
-
-    // Get CrawlType max page
-    GetMaxPage(initURLSlice, initElem, initHrefElem)
-
-    // Generates each type's URLList
-    alexaTop1000.NewURLList()
-    blocked.NewURLList()
-    domains.NewURLList()

+    if err := config.SetNumCPU(); err != nil {
+        log.Println(err)
+        os.Exit(3)
+    }

-    // Generate items to be crawled
-    crawlItems := make([]string, 0)
-    for crawlType := range initURLSlice {
-        for _, url := range crawlType.URLList {
-            crawlItems = append(crawlItems, url)
-        }
+    for err := range config.CrawlMaxPage() {
+        log.Println(err)
+        os.Exit(4)
     }

-    resultChan := make(chan map[string]int, maxCap)
+    if err := config.GenerateCrawlList(); err != nil {
+        log.Println(err)
+        os.Exit(5)
+    }

-    go ControlFlow(crawlItems, resultChan, elem, uElem, bElem, retryTimes, numCPUs)
-    ValidateAndWrite(resultChan, filteredFile, rawFile, re, reForIP, percentStd)
+    maxCap := config.Customize.MaxCapacity
+    rawResultChan := make(chan map[*string]int, maxCap)
+    go config.Crawl(rawResultChan)
+    config.FilterAndWrite(rawResultChan)
 }
diff --git a/parser/parse.go b/parser/parse.go
deleted file mode 100644
index 17a3c84..0000000
--- a/parser/parse.go
+++ /dev/null
@@ -1,40 +0,0 @@
-package parser
-
-import (
-    "compress/gzip"
-    "log"
-    "strconv"
-
-    "github.com/Loyalsoldier/cn-blocked-domain/utils"
-    "github.com/PuerkitoBio/goquery"
-)
-
-// HTMLParser parses webpage content and sends URL & percent map to channel
-func HTMLParser(resultChan chan map[string]int, data *gzip.Reader, elem, uElem, bElem string) {
-    defer func() {
-        if err := recover(); err != nil {
-            log.Printf("Runtime panic: %v\n", err)
-        }
-    }()
-
-    // Load the HTML document
-    doc, err := goquery.NewDocumentFromReader(data)
-    utils.CheckError(err)
-
-    // Find items
-    doc.Find(elem).Each(func(i int, s *goquery.Selection) {
-        // For each item found, get contents
-        url, _ := s.Find(uElem).Attr("href")
-        bPerStr := s.Find(bElem).Text()
-
-        var blockPerNum, percent int
-        if bPerStr != "" {
-            blockPerNum, _ = strconv.Atoi(bPerStr[:len(bPerStr)-1])
-            percent = blockPerNum
-        }
-
-        result := make(map[string]int)
-        result[url] = percent
-        resultChan <- result
-    })
-}
diff --git a/program.go b/program.go
new file mode 100644
index 0000000..d7cc32c
--- /dev/null
+++ b/program.go
@@ -0,0 +1,453 @@
+package main
+
+import (
+    "bufio"
+    "compress/gzip"
+    "encoding/json"
+    "errors"
+    "fmt"
+    "log"
+    "os"
+    "path/filepath"
+    "regexp"
+    "runtime"
+    "sort"
+    "strconv"
+    "strings"
+    "sync"
+
+    "github.com/PuerkitoBio/goquery"
+    "gopkg.in/yaml.v2"
+
+    "github.com/Loyalsoldier/cn-blocked-domain/crawler"
+    "github.com/Loyalsoldier/cn-blocked-domain/utils"
+)
+
+var (
+    ErrConfigFormatNotSupported = errors.New("config format not supported")
+    ErrConfigIsEmpty            = errors.New("config is empty")
+    ErrCrawlConfigIsEmpty       = errors.New("crawl config is empty")
+    ErrFilterConfigIsEmpty      = errors.New("filter config is empty")
+    ErrCustomizeConfigIsEmpty   = errors.New("customize config is empty")
+    ErrInvalidPageNumber        = errors.New("invalid page number")
+)
+
+type URL struct {
+    BaseURL       string `yaml:"base_url,omitempty" json:"base_url,omitempty"`
+    InitSuffixURL string `yaml:"init_suffix_url,omitempty" json:"init_suffix_url,omitempty"`
+    SuffixURL     string `yaml:"suffix_url,omitempty" json:"suffix_url,omitempty"`
+}
+
+type Type struct {
+    Name    string `yaml:"name,omitempty" json:"name,omitempty"`
+    TypeURL string `yaml:"type_url,omitempty" json:"type_url,omitempty"`
+    Referer string `yaml:"referer,omitempty" json:"referer,omitempty"`
+    IsCrawl bool   `yaml:"is_crawl,omitempty" json:"is_crawl,omitempty"`
+    From    int    `yaml:"from,omitempty" json:"from,omitempty"`
+    To      int    `yaml:"to,omitempty" json:"to,omitempty"`
+}
+
+type Elem struct {
+    Container string `yaml:"container,omitempty" json:"container,omitempty"`
+    Content   string `yaml:"content,omitempty" json:"content,omitempty"`
+    Condition string `yaml:"condition,omitempty" json:"condition,omitempty"`
+    Attr      string `yaml:"attr,omitempty" json:"attr,omitempty"`
+    Splitter  string `yaml:"splitter,omitempty" json:"splitter,omitempty"`
+}
+
+type Crawl struct {
+    *URL
+    Types        []*Type `yaml:"types,omitempty" json:"types,omitempty"`
+    InitElement  *Elem   `yaml:"init_element,omitempty" json:"init_element,omitempty"`
+    CrawlElement *Elem   `yaml:"crawl_element,omitempty" json:"crawl_element,omitempty"`
+}
+
+type FilterType struct {
+    Domain string `yaml:"domain,omitempty" json:"domain,omitempty"`
+    IP     string `yaml:"ip,omitempty" json:"ip,omitempty"`
+}
+
+type Filter struct {
+    Regexp  *FilterType `yaml:"regexp,omitempty" json:"regexp,omitempty"`
+    Percent int         `yaml:"percent,omitempty" json:"percent,omitempty"`
+}
+
+type Customize struct {
+    CPUCores       int    `yaml:"cpu_cores,omitempty" json:"cpu_cores,omitempty"`
+    MaxCapacity    int    `yaml:"max_capacity,omitempty" json:"max_capacity,omitempty"`
+    OutputDir      string `yaml:"output_dir,omitempty" json:"output_dir,omitempty"`
+    RawFilename    string `yaml:"raw_filename,omitempty" json:"raw_filename,omitempty"`
+    DomainFilename string `yaml:"domain_filename,omitempty" json:"domain_filename,omitempty"`
+    IPFilename     string `yaml:"ip_filename,omitempty" json:"ip_filename,omitempty"`
+}
+
+// RawConfig defines the configuration read from config files
+type RawConfig struct {
+    *Crawl
+    *Filter
+    *Customize
+}
+
+func (r *RawConfig) ParseRawConfig(configFile string) error {
+    switch {
+    case strings.HasSuffix(configFile, ".yaml"), strings.HasSuffix(configFile, ".yml"):
+        configBytes, err := os.ReadFile(configFile)
+        if err != nil {
+            return err
+        }
+        if err := yaml.Unmarshal(configBytes, &r); err != nil {
+            return err
+        }
+    case strings.HasSuffix(configFile, ".json"):
+        configBytes, err := os.ReadFile(configFile)
+        if err != nil {
+            return err
+        }
+        if err := json.Unmarshal(configBytes, &r); err != nil {
+            return err
+        }
+    default:
+        return ErrConfigFormatNotSupported
+    }
+    return nil
+}
+
+// GreatFireURL defines the parts that make up a GreatFire URL
+type GreatFireURL struct {
+    BaseURL       string
+    TypeURL       string
+    SuffixURL     string
+    InitSuffixURL string
+}
+
+// CrawlType defines one crawl type and its URL list
+type CrawlType struct {
+    *GreatFireURL
+    Name         string
+    IsCrawl      bool
+    MaxPage      int
+    From, To     int
+    InitElement  *Elem
+    CrawlElement *Elem
+    CrawlReferer string
+    CrawlList    []string
+}
+
+// Config defines the real configuration used in the program
+type Config struct {
+    *Filter
+    *Customize
+    Types []*CrawlType
+}
+
+// GenerateConfig converts the raw config into the config used by the program
+func (c *Config) GenerateConfig(r *RawConfig) error {
+    if r != nil {
+        if r.Filter != nil {
+            c.Filter = r.Filter
+        } else {
+            return ErrFilterConfigIsEmpty
+        }
+        if r.Customize != nil {
+            c.Customize = r.Customize
+        } else {
+            return ErrCustomizeConfigIsEmpty
+        }
+        if r.Crawl != nil && r.Crawl.Types != nil {
+            c.Types = make([]*CrawlType, len(r.Crawl.Types))
+            for i := 0; i < len(r.Crawl.Types); i++ {
+                rawType := r.Crawl.Types[i]
+                c.Types[i] = &CrawlType{
+                    GreatFireURL: &GreatFireURL{
+                        BaseURL:       r.Crawl.URL.BaseURL,
+                        TypeURL:       rawType.TypeURL,
+                        SuffixURL:     r.Crawl.URL.SuffixURL,
+                        InitSuffixURL: r.Crawl.URL.InitSuffixURL,
+                    },
+                    Name:         rawType.Name,
+                    IsCrawl:      rawType.IsCrawl,
+                    From:         rawType.From,
+                    To:           rawType.To,
+                    InitElement:  r.Crawl.InitElement,
+                    CrawlElement: r.Crawl.CrawlElement,
+                    CrawlReferer: rawType.Referer,
+                }
+            }
+            return nil
+        } else {
+            return ErrCrawlConfigIsEmpty
+        }
+    }
+    return ErrConfigIsEmpty
+}
+
+// SetNumCPU sets GOMAXPROCS according to the configuration
+func (c *Config) SetNumCPU() error {
+    if c.Customize != nil {
+        setNum := c.Customize.CPUCores
+        originalNumCPU := runtime.NumCPU()
+        log.Println("Original CPU cores:", originalNumCPU)
+
+        if setNum > originalNumCPU {
+            runtime.GOMAXPROCS(setNum)
+            log.Println("Now CPU cores:", setNum)
+            return nil
+        }
+        switch {
+        case originalNumCPU == 1:
+            originalNumCPU = 3
+        case originalNumCPU == 2:
+            originalNumCPU *= 3
+        case originalNumCPU == 3:
+            originalNumCPU *= 2
+        case originalNumCPU == 4:
+            originalNumCPU = 10
+        default:
+            originalNumCPU += int(0.5 * float64(originalNumCPU))
+        }
+        runtime.GOMAXPROCS(originalNumCPU)
+        c.Customize.CPUCores = originalNumCPU
+        log.Println("Now CPU cores:", originalNumCPU)
+        return nil
+    } else {
+        return ErrCustomizeConfigIsEmpty
+    }
+}
+
+// CrawlMaxPage gets the max page of each crawl type
+func (c *Config) CrawlMaxPage() chan error {
+    var wg sync.WaitGroup
+    wg.Add(len(c.Types))
+
+    e := make(chan error, len(c.Types))
+    for idx, crawlType := range c.Types {
+        go func(idx int, crawlType *CrawlType) {
+            defer wg.Done()
+            crawlInitURL := crawlType.BaseURL + crawlType.TypeURL + crawlType.InitSuffixURL
+            crawlName := crawlType.Name
+            crawlContent := crawlType.InitElement.Content
+
+            switch crawlType.IsCrawl {
+            case false:
+                log.Printf("Type %s has been disabled to crawl.\n", crawlName)
+            default:
+                resp, err := crawler.Crawl(crawlInitURL, crawlType.CrawlReferer)
+                if err != nil {
+                    e <- err
+                    return
+                }
+                defer resp.Body.Close()
+
+                gzipReader, err := gzip.NewReader(resp.Body)
+                if err != nil {
+                    e <- err
+                    return
+                }
+                defer gzipReader.Close()
+
+                // Load the HTML document
+                doc, err := goquery.NewDocumentFromReader(gzipReader)
+                if err != nil {
+                    e <- err
+                    return
+                }
+
+                // Find items
+                doc.Find(crawlType.InitElement.Container).Each(func(i int, s *goquery.Selection) {
+                    // For each item found, get contents
+                    if lastPageHref, exists := s.Find(crawlContent).Attr(crawlType.InitElement.Attr); !exists {
+                        log.Printf("Cannot find HTML element `%s`\n", crawlContent)
+                    } else {
+                        matchedSlice := strings.Split(lastPageHref, crawlType.InitElement.Splitter)
+                        if len(matchedSlice) == 2 {
+                            maxPageString := matchedSlice[1]
+                            if maxpage, err := strconv.Atoi(maxPageString); err != nil {
+                                log.Printf("Failed to get max page of type %s.\n", crawlName)
+                            } else {
+                                c.Types[idx].MaxPage = maxpage
+                                log.Printf("Type %s has pages: %d\n", crawlName, maxpage+1)
+                            }
+                        }
+                    }
+                })
+            }
+        }(idx, crawlType)
+    }
+
+    wg.Wait()
+    defer close(e)
+    return e
+}
+
+// GenerateCrawlList generates lists for each crawl type to be crawled later
+func (c *Config) GenerateCrawlList() error {
+    for idx, crawlType := range c.Types {
+        if !crawlType.IsCrawl {
+            continue
+        }
+        maxpage := crawlType.MaxPage
+        from := crawlType.From
+        to := crawlType.To
+
+        if to < 0 {
+            to = maxpage
+        }
+
+        if from < 0 || from > maxpage || to > maxpage || from > to {
+            return ErrInvalidPageNumber
+        }
+
+        log.Printf("Type %s will be crawled from page %d to %d", crawlType.Name, from, to)
+
+        list := make([]string, 0, maxpage)
+        for i := from; i <= to; i++ {
+            url := crawlType.BaseURL + crawlType.TypeURL + crawlType.SuffixURL + strconv.Itoa(i)
+            list = append(list, url)
+        }
+
+        c.Types[idx].CrawlList = list
+    }
+    return nil
+}
+
+// Crawl gets HTML content for crawl types
+func (c *Config) Crawl(rawResultChan chan map[*string]int) {
+    var wg sync.WaitGroup
+    workerPool := make(chan struct{}, c.Customize.CPUCores)
+
+    for _, crawlType := range c.Types {
+        for _, url := range crawlType.CrawlList {
+            workerPool <- struct{}{}
+            wg.Add(1)
+
+            go func(url string, crawlType *CrawlType) {
+                defer func() {
+                    if err := recover(); err != nil {
+                        log.Printf("Goroutine panic: fetching %v : %v\n", url, err)
+                    }
+                }()
+                defer wg.Done()
+                defer func() { <-workerPool }()
+
+                container := crawlType.CrawlElement.Container
+                content := crawlType.CrawlElement.Content
+                attr := crawlType.CrawlElement.Attr
+                condition := crawlType.CrawlElement.Condition
+
+                log.Println("Crawling:", url)
+                resp, err := crawler.Crawl(url, crawlType.CrawlReferer)
+                utils.Must(err)
+                defer resp.Body.Close()
+
+                gzipReader, err := gzip.NewReader(resp.Body)
+                utils.Must(err)
+                defer gzipReader.Close()
+
+                // Load the HTML document
+                doc, err := goquery.NewDocumentFromReader(gzipReader)
+                utils.Must(err)
+
+                // Find items
+                doc.Find(container).Each(func(i int, s *goquery.Selection) {
+                    percent := 0
+                    // For each item found, get contents
+                    rawDomain, _ := s.Find(content).Attr(attr)
+                    if blockedPercentage := strings.TrimSpace(s.Find(condition).Text()); blockedPercentage != "" {
+                        percent, _ = strconv.Atoi(blockedPercentage[:len(blockedPercentage)-1])
+                    }
+
+                    rawResult := make(map[*string]int)
+                    rawResult[&rawDomain] = percent
+                    rawResultChan <- rawResult
+                })
+
+            }(url, crawlType)
+        }
+    }
+
+    wg.Wait()
+    close(rawResultChan)
+}
+
+// FilterAndWrite filters crawl results and writes them to files
+func (c *Config) FilterAndWrite(rawResultChan chan map[*string]int) {
+    defer func() {
+        if err := recover(); err != nil {
+            log.Printf("Runtime panic: %v\n", err)
+        }
+    }()
+
+    // Make output dir
+    utils.Must(os.MkdirAll(filepath.Join("./", c.Customize.OutputDir), 0755))
+
+    rawDomainFile, err := os.OpenFile(filepath.Join(c.Customize.OutputDir, c.Customize.RawFilename), os.O_WRONLY|os.O_CREATE, 0644)
+    utils.Must(err)
+    defer rawDomainFile.Close()
+
+    finalDomainFile, err := os.OpenFile(filepath.Join(c.Customize.OutputDir, c.Customize.DomainFilename), os.O_WRONLY|os.O_CREATE, 0644)
+    utils.Must(err)
+    defer finalDomainFile.Close()
+
+    finalIPfile, err := os.OpenFile(filepath.Join(c.Customize.OutputDir, c.Customize.IPFilename), os.O_WRONLY|os.O_CREATE, 0644)
+    utils.Must(err)
+    defer finalIPfile.Close()
+
+    resultMap := make(map[string]struct{})
+    domainReg := regexp.MustCompile(c.Filter.Regexp.Domain)
+    rawWriter := bufio.NewWriter(rawDomainFile)
+    for result := range rawResultChan {
+        for url, percent := range result {
+            url := strings.ToLower(*url)
+            // Write raw results to raw.txt file
+            rawWriter.WriteString(fmt.Sprintf("%s | %d\n", url, percent))
+
+            if percent >= c.Filter.Percent {
+                matchList := domainReg.FindStringSubmatch(url)
+                if len(matchList) > 0 {
+                    domain := matchList[len(matchList)-2]
+                    // Write filtered results to console
+                    fmt.Printf("%s | %d\n", domain, percent)
+                    // Write filtered results to map to make them unique
+                    resultMap[domain] = struct{}{}
+                }
+            }
+        }
+    }
+    rawWriter.Flush()
+
+    resultSlice := make([]string, 0, len(resultMap))
+    ipSlice := make([]string, 0, len(resultMap))
+    ipReg := regexp.MustCompile(c.Filter.Regexp.IP)
+    for domainOrIP := range resultMap {
+        ipElem := ipReg.FindStringSubmatch(domainOrIP)
+        if len(ipElem) > 0 {
+            ipSlice = append(ipSlice, ipElem[0])
+            continue
+        }
+        resultSlice = append(resultSlice, domainOrIP)
+    }
+
+    // Unique and sort domain slice
+    sort.SliceStable(resultSlice, func(i, j int) bool {
+        return len(strings.Split(resultSlice[i], ".")) < len(strings.Split(resultSlice[j], "."))
+    })
+    resultSlice = buildTreeAndUnique(resultSlice)
+    sort.Strings(resultSlice)
+
+    // Write filtered result to domains.txt file
+    domainWriter := bufio.NewWriter(finalDomainFile)
+    for _, domain := range resultSlice {
+        domainWriter.WriteString(fmt.Sprintf("%s\n", domain))
+    }
+    domainWriter.Flush()
+
+    // Sort IP slice
+    sort.Strings(ipSlice)
+
+    // Write IP results to ip.txt file
+    ipWriter := bufio.NewWriter(finalIPfile)
+    for _, ip := range ipSlice {
+        ipWriter.WriteString(fmt.Sprintf("%s\n", ip))
+    }
+    ipWriter.Flush()
+}
diff --git a/tree.go b/tree.go
index 8d27fbb..dbe51b7 100644
--- a/tree.go
+++ b/tree.go
@@ -1,8 +1,6 @@
 package main

-import (
-    "errors"
-)
+import "errors"

 type node struct {
     leaf     bool
diff --git a/utils/cpu.go b/utils/cpu.go
deleted file mode 100644
index ad5b46a..0000000
--- a/utils/cpu.go
+++ /dev/null
@@ -1,20 +0,0 @@
-package utils
-
-import "runtime"
-
-// SetGOMAXPROCS sets precise number of Go processors
-func SetGOMAXPROCS() (int, int) {
-    numCPUs := runtime.NumCPU()
-    orginalCPUs := numCPUs
-    switch {
-    case numCPUs <= 1:
-        numCPUs = 2
-    case numCPUs <= 4:
-        numCPUs *= 3
-    default:
-        numCPUs *= 2
-    }
-    runtime.GOMAXPROCS(numCPUs)
-
-    return orginalCPUs, numCPUs
-}
diff --git a/utils/error.go b/utils/error.go
index 67e708a..aa7e077 100644
--- a/utils/error.go
+++ b/utils/error.go
@@ -1,8 +1,15 @@
 package utils

-// CheckError panics runtime error
-func CheckError(err error) {
+// Must panics if err is not nil
+func Must(err error) {
     if err != nil {
         panic(err)
     }
 }
+
+func Must2(v interface{}, err error) interface{} {
+    if err != nil {
+        panic(err)
+    }
+    return v
+}
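
The refactored binary reads its settings from the file passed via -c (config.yaml by default), which is not part of this patch. The sketch below is only an illustration of what such a file could look like: the key names come from the yaml struct tags added in program.go, most values are borrowed from the constants this patch removes from main.go, and the flat layout assumes gopkg.in/yaml.v2 inlines the embedded Crawl, Filter and Customize structs; output_dir, cpu_cores and the from/to ranges are placeholders.

base_url: "https://zh.greatfire.org/search/"
init_suffix_url: "?page=0"
suffix_url: "?page="
types:
  - name: "alexa-top-1000-domains"
    type_url: "alexa-top-1000-domains"
    referer: "https://zh.greatfire.org"
    is_crawl: true
    from: 0
    to: -1
  - name: "blocked"
    type_url: "blocked"
    referer: "https://zh.greatfire.org"
    is_crawl: true
    from: 0
    to: -1
  - name: "domains"
    type_url: "domains"
    referer: "https://zh.greatfire.org"
    is_crawl: true
    from: 0
    to: -1
init_element:
  container: "ul.pager"
  content: ".pager-last.last a"
  attr: "href"
  splitter: "?page="
crawl_element:
  container: "table.gf-header tbody tr"
  content: "td.first a"
  condition: "td.blocked"
  attr: "href"
regexp:
  domain: '^\/(https?\/)?([a-zA-Z0-9][-_a-zA-Z0-9]{0,62}(\.[a-zA-Z0-9][-_a-zA-Z0-9]{0,62})+)$'
  ip: '(([0-9]{1,3}\.){3}[0-9]{1,3})'
percent: 50
cpu_cores: 4
max_capacity: 1600
output_dir: "publish"
raw_filename: "raw.txt"
domain_filename: "domains.txt"
ip_filename: "ip.txt"

With a file like this in the working directory, a build of the main package would be run as, for example, ./cn-blocked-domain -c config.yaml.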