Commit 776d706: code linting

adbar committed Oct 28, 2021
1 parent 26c7f2e commit 776d706
Showing 3 changed files with 14 additions and 6 deletions.
tests/cli_tests.py: 13 changes (11 additions, 2 deletions)
@@ -278,6 +278,7 @@ def test_cli_pipeline():
    # teststring = f.read()
    #result = cli.examine(teststring, args)
    #assert '[link](testlink.html)' in result # and 'test.jpg' in result

    # Crawling
    testargs = ['', '--crawl', 'https://httpbin.org/html']
    with patch.object(sys, 'argv', testargs):
@@ -286,14 +287,22 @@ def test_cli_pipeline():
    with redirect_stdout(f):
        cli_utils.cli_crawler(args)
    assert len(f.getvalue()) == 0
    testargs = ['', '--crawl', 'https://httpbin.org/links/1/1', '--list']
    # links permitted
    testargs = ['', '--crawl', 'https://httpbin.org/links/1/1', '--list', '--parallel', '1']
    with patch.object(sys, 'argv', testargs):
        args = cli.parse_args(testargs)
    f = io.StringIO()
    with redirect_stdout(f):
        cli_utils.cli_crawler(args)
    print(f.getvalue())
    assert f.getvalue() == 'https://httpbin.org/links/1/0\n'
    # 0 links permitted
    args.crawl = 'https://httpbin.org/links/4/4'
    f = io.StringIO()
    with redirect_stdout(f):
        cli_utils.cli_crawler(args, n=0)
    # print(f.getvalue())
    assert len(f.getvalue().split('\n')) == 5

    # Exploration (Sitemap + Crawl)
    testargs = ['', '--explore', 'https://httpbin.org/html']
    with patch.object(sys, 'argv', testargs):
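These tests drive the command-line crawler in-process by patching sys.argv and capturing stdout, then asserting on the printed URLs. A minimal, self-contained sketch of that pattern, using a hypothetical run_cli() stand-in rather than trafilatura's cli_utils.cli_crawler:

import io
import sys
from contextlib import redirect_stdout
from unittest.mock import patch


def run_cli():
    # hypothetical stand-in for a CLI entry point; it just echoes its arguments
    print(' '.join(sys.argv[1:]))


def test_run_cli():
    testargs = ['', '--crawl', 'https://httpbin.org/html']
    # sys.argv is swapped out only inside the with-block
    with patch.object(sys, 'argv', testargs):
        f = io.StringIO()
        # everything printed by run_cli() lands in f instead of the terminal
        with redirect_stdout(f):
            run_cli()
    assert f.getvalue() == '--crawl https://httpbin.org/html\n'


test_run_cli()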
tests/downloads_tests.py: 1 change (0 additions, 1 deletion)
@@ -84,7 +84,6 @@ def test_queue():
            ]
        )
    }

    args.archived = True
    args.config_file = os.path.join(RESOURCES_DIR, 'newsettings.cfg')
    config = use_config(filename=args.config_file)
trafilatura/cli_utils.py: 6 changes (3 additions, 3 deletions)
@@ -220,7 +220,7 @@ def process_result(htmlstring, args, url, counter, config):
def download_queue_processing(domain_dict, args, counter, config):
    '''Implement a download queue consumer, single- or multi-threaded'''
    sleep_time = config.getfloat('DEFAULT', 'SLEEP_TIME')
    backoff_dict, errors = dict(), []
    backoff_dict, errors = {}, []
    while domain_dict:
        bufferlist, download_threads, domain_dict, backoff_dict = load_download_buffer(domain_dict, backoff_dict, sleep_time, threads=args.parallel)
        # process downloads
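The linting change in this hunk (and the two below) swaps dict() calls for literal syntax. Both build an empty dictionary; the literal simply avoids a global name lookup and a function call, which is why linters commonly suggest it. A quick sketch of the equivalence:

from timeit import timeit

# both expressions produce an empty dictionary
assert dict() == {}

# the literal skips the lookup of the name `dict` and the call overhead,
# so it is typically measurably faster over many iterations
print(timeit('dict()', number=1_000_000))
print(timeit('{}', number=1_000_000))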
@@ -240,7 +240,7 @@ def cli_crawler(args, n=30, domain_dict=None):
    and prints the links found in the process'''
    config = use_config(filename=args.config_file)
    sleep_time = config.getfloat('DEFAULT', 'SLEEP_TIME')
    counter, crawlinfo, backoff_dict = None, dict(), dict()
    counter, crawlinfo, backoff_dict = None, {}, {}
    # load input URLs
    if domain_dict is None:
        domain_dict = load_input_dict(args)
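cli_crawler() reads SLEEP_TIME through trafilatura's use_config() helper, which hands back a standard configparser object, so getfloat() pulls the value from the [DEFAULT] section as a float. A minimal sketch of the same access pattern with plain configparser and a hypothetical settings snippet (the real option lives in trafilatura's settings.cfg):

from configparser import ConfigParser

# hypothetical settings snippet standing in for a settings.cfg file
SETTINGS = '''
[DEFAULT]
SLEEP_TIME = 5.0
'''

config = ConfigParser()
config.read_string(SETTINGS)

# same access pattern as in cli_crawler()
sleep_time = config.getfloat('DEFAULT', 'SLEEP_TIME')
assert sleep_time == 5.0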
@@ -276,7 +276,7 @@ def cli_crawler(args, n=30, domain_dict=None):
        # if args.archived is True:
        # errors.append(url)
        # early exit if maximum count is reached
        if any(i >= n for i in [crawlinfo[site]['count'] for site in crawlinfo]):
        if any(i >= n for i in [dictvalue['count'] for _, dictvalue in crawlinfo.items()]):
            break
    # print results
    for website in sorted(domain_dict):
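The early-exit check above asks whether any crawled site has reached the per-domain page limit n. The linted version still builds an intermediate list inside any(); an equivalent and slightly leaner form, shown here only as an illustration and not part of this commit, iterates over the dictionary values directly. The crawlinfo entries below are made up and reduced to the 'count' field used in the check:

# reduced, made-up crawl bookkeeping: one entry per started site
crawlinfo = {
    'https://example.org': {'count': 12},
    'https://example.net': {'count': 31},
}
n = 30

# condition as written in the commit
assert any(i >= n for i in [dictvalue['count'] for _, dictvalue in crawlinfo.items()])

# equivalent check without the intermediate list or the unused keys
assert any(info['count'] >= n for info in crawlinfo.values())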
