Commit 776d706: code linting

adbar committed Oct 28, 2021
1 parent 26c7f2e commit 776d706
Showing 3 changed files with 14 additions and 6 deletions.
tests/cli_tests.py: 13 changes (11 additions, 2 deletions)
@@ -278,6 +278,7 @@ def test_cli_pipeline():
    # teststring = f.read()
    #result = cli.examine(teststring, args)
    #assert '[link](testlink.html)' in result # and 'test.jpg' in result

    # Crawling
    testargs = ['', '--crawl', 'https://httpbin.org/html']
    with patch.object(sys, 'argv', testargs):
@@ -286,14 +287,22 @@ def test_cli_pipeline():
    with redirect_stdout(f):
        cli_utils.cli_crawler(args)
    assert len(f.getvalue()) == 0
    testargs = ['', '--crawl', 'https://httpbin.org/links/1/1', '--list']
    # links permitted
    testargs = ['', '--crawl', 'https://httpbin.org/links/1/1', '--list', '--parallel', '1']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    f = io.StringIO()
    with redirect_stdout(f):
        cli_utils.cli_crawler(args)
    print(f.getvalue())
    assert f.getvalue() == 'https://httpbin.org/links/1/0\n'
    # 0 links permitted
    args.crawl = 'https://httpbin.org/links/4/4'
    f = io.StringIO()
    with redirect_stdout(f):
        cli_utils.cli_crawler(args, n=0)
    # print(f.getvalue())
    assert len(f.getvalue().split('\n')) == 5

    # Exploration (Sitemap + Crawl)
    testargs = ['', '--explore', 'https://httpbin.org/html']
    with patch.object(sys, 'argv', testargs):
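These tests drive the command-line crawler in-process by patching sys.argv and capturing stdout, then asserting on the printed URLs. A minimal, self-contained sketch of that pattern, using a hypothetical run_cli() stand-in rather than trafilatura's cli_utils.cli_crawler:

import io
import sys
from contextlib import redirect_stdout
from unittest.mock import patch


def run_cli():
    # hypothetical stand-in for a CLI entry point; it just echoes its arguments
    print(' '.join(sys.argv[1:]))


def test_run_cli():
    testargs = ['', '--crawl', 'https://httpbin.org/html']
    # sys.argv is swapped out only inside the with-block
    with patch.object(sys, 'argv', testargs):
        f = io.StringIO()
        # everything printed by run_cli() lands in f instead of the terminal
        with redirect_stdout(f):
            run_cli()
    assert f.getvalue() == '--crawl https://httpbin.org/html\n'


test_run_cli()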
tests/downloads_tests.py: 1 change (0 additions, 1 deletion)
@@ -84,7 +84,6 @@ def test_queue():
            ]
        )
    }

    args.archived = True
    args.config_file = os.path.join(RESOURCES_DIR, 'newsettings.cfg')
    config = use_config(filename=args.config_file)
trafilatura/cli_utils.py: 6 changes (3 additions, 3 deletions)
@@ -220,7 +220,7 @@ def process_result(htmlstring, args, url, counter, config):
def download_queue_processing(domain_dict, args, counter, config):
    '''Implement a download queue consumer, single- or multi-threaded'''
    sleep_time = config.getfloat('DEFAULT', 'SLEEP_TIME')
    backoff_dict, errors = dict(), []
    backoff_dict, errors = {}, []
    while domain_dict:
        bufferlist, download_threads, domain_dict, backoff_dict = load_download_buffer(domain_dict, backoff_dict, sleep_time, threads=args.parallel)
        # process downloads
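The linting change in this hunk (and the two below) swaps dict() calls for literal syntax. Both build an empty dictionary; the literal simply avoids a global name lookup and a function call, which is why linters commonly suggest it. A quick sketch of the equivalence:

from timeit import timeit

# both expressions produce an empty dictionary
assert dict() == {}

# the literal skips the lookup of the name `dict` and the call overhead,
# so it is typically measurably faster over many iterations
print(timeit('dict()', number=1_000_000))
print(timeit('{}', number=1_000_000))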
@@ -240,7 +240,7 @@ def cli_crawler(args, n=30, domain_dict=None):
    and prints the links found in the process'''
    config = use_config(filename=args.config_file)
    sleep_time = config.getfloat('DEFAULT', 'SLEEP_TIME')
    counter, crawlinfo, backoff_dict = None, dict(), dict()
    counter, crawlinfo, backoff_dict = None, {}, {}
    # load input URLs
    if domain_dict is None:
        domain_dict = load_input_dict(args)
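cli_crawler() reads SLEEP_TIME through trafilatura's use_config() helper, which hands back a standard configparser object, so getfloat() pulls the value from the [DEFAULT] section as a float. A minimal sketch of the same access pattern with plain configparser and a hypothetical settings snippet (the real option lives in trafilatura's settings.cfg):

from configparser import ConfigParser

# hypothetical settings snippet standing in for a settings.cfg file
SETTINGS = '''
[DEFAULT]
SLEEP_TIME = 5.0
'''

config = ConfigParser()
config.read_string(SETTINGS)

# same access pattern as in cli_crawler()
sleep_time = config.getfloat('DEFAULT', 'SLEEP_TIME')
assert sleep_time == 5.0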
@@ -276,7 +276,7 @@ def cli_crawler(args, n=30, domain_dict=None):
        # if args.archived is True:
        # errors.append(url)
        # early exit if maximum count is reached
        if any(i >= n for i in [crawlinfo[site]['count'] for site in crawlinfo]):
        if any(i >= n for i in [dictvalue['count'] for _, dictvalue in crawlinfo.items()]):
            break
    # print results
    for website in sorted(domain_dict):
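The early-exit check above asks whether any crawled site has reached the per-domain page limit n. The linted version still builds an intermediate list inside any(); an equivalent and slightly leaner form, shown here only as an illustration and not part of this commit, iterates over the dictionary values directly. The crawlinfo entries below are made up and reduced to the 'count' field used in the check:

# reduced, made-up crawl bookkeeping: one entry per started site
crawlinfo = {
    'https://example.org': {'count': 12},
    'https://example.net': {'count': 31},
}
n = 30

# condition as written in the commit
assert any(i >= n for i in [dictvalue['count'] for _, dictvalue in crawlinfo.items()])

# equivalent check without the intermediate list or the unused keys
assert any(info['count'] >= n for info in crawlinfo.values())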
