#!/usr/bin/env python3
# TODO check internet connection = check DNS
# ping 1.1.1.1 + resolveip opensubtitles.org
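# a minimal sketch for the connectivity check described in the TODO above,
# assuming "ping 1.1.1.1" for IP connectivity and a DNS lookup of
# opensubtitles.org; check_internet_connection is a hypothetical helper,
# it is not called anywhere in this script yet
def check_internet_connection(timeout=5):
    """return True if we have IP connectivity and working DNS"""
    import socket
    import subprocess
    # IP connectivity: one ping to 1.1.1.1
    ping_ok = subprocess.run(
        ["ping", "-c", "1", "-W", str(timeout), "1.1.1.1"],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    ).returncode == 0
    # DNS: resolve opensubtitles.org (similar to "resolveip opensubtitles.org")
    try:
        socket.gethostbyname("opensubtitles.org")
        dns_ok = True
    except socket.gaierror:
        dns_ok = False
    return ping_ok and dns_ok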
# TODO visualize missing nums versus complete shards
# TODO scrape ratings + download counts of all subs
# intercept requests to only fetch the html code
# don't fetch images, styles, scripts
# TODO keep track of download count / quota
# when is the quota reset to zero?
# FIXME error: Site will be online soon. We are doing some necessary backups and upgrades. Thanks for understanding.
# <pre>Site will be online soon. We are doing some necessary backups and upgrades. Thanks for understanding.
# FIXME psutil.NoSuchProcess @ psutil
# logger_print(f'cleanup_main: killing child process {child.name()} pid {child.pid}')
# TODO check download_quota versus daily_quota
# also remove daily_quota_is_exceeded
# FIXME also create empty files for missing subs
# when scraping the latest 1000 subs in descending order
# FIXME integrate fetch-subs-add-zipfiles.sh into this script
# adding one file to new-subs-repo takes about 2 seconds
# and we can use this delay as "sleep time" for the scraper
# FIXME CDPError: DOM Error while querying
# NOTE lots of dead code and bad coding style here...
# but it kind-of works :P
# FIXME chromium is rendering pages much slower
# when the chromium window is not visible on the desktop
# = if the chromium window is "hidden" in the background
# TODO headful scraper with captcha solver
# fix: blocked -> start a new session
# TODO fetch missing subs of first release
# between sub ID 1 and 9180518
# watch "ls -lt new-subs/ | head"
# TODO wait between requests -> fix semaphore
# TODO fix ublock extension -> options.add_argument
# TODO set name of root logger -> def logger_print
# FIXME asyncio ERROR Task exception was never retrieved
# handle errors from aiohttp_chromium_session.get
# FIXME postprocess: fix wrong dmca entries
# examples:
# these files were not processed by new-subs-migrate.py
# because dmca entries exist in new-subs-repo/files.txt
# TODO also check files in new-subs-repo/trash/
"""
$ ls new-subs-repo/ | cat
9540221.aint.she.tweet.(1952).eng.1cd.zip
9540240.book.revue.(1946).eng.1cd.zip
9540304.forget.me.(1994).dut.1cd.zip
9540310.premier.voyage.(1980).ell.1cd.zip
9540353.queen.slim.(2019).fin.1cd.zip
9540451.jewish.matchmaking.s01.e01.episode.1.1.(2023).spa.1cd.zip
9540476.jewish.matchmaking.s01.e07.so.the.song.goes.().heb.1cd.zip
9540515.jewish.matchmaking.s01.e01.episode.1.1.(2023).dut.1cd.zip
9540545.jewish.matchmaking.s01.e04.year.of.the.cindy.().rus.1cd.zip
9540550.not-found
9540572.jewish.matchmaking.s01.e04.year.of.the.cindy.().chi.1cd.zip
9540630.love.village.s01.e04.episode.1.4.().ita.1cd.zip
9540653.love.village.s01.e03.episode.1.3.().ara.1cd.zip
9540664.love.village.s01.e02.episode.1.2.().pob.1cd.zip
9540667.luke.cage.s01.e03.whos.gonna.take.the.weight.(2016).spl.1cd.zip
9540722.mama.ist.unmoglich.s01.e03.familientreffen.(1997).ger.1cd.zip
$ cat new-subs-repo/files.txt | grep -e 9540221 -e 9540240 -e 9540304 -e 9540310 -e 9540353 -e 9540451 -e 9540476
9540304.dmca
9540451.dmca
9540221.dmca
9540240.dmca
9540353.dmca
9540310.dmca
9540476.dmca
TODO verify status 404
https://github.com/milahu/opensubtitles-scraper-test/actions/runs/4908766906/jobs/8764739331
2023-05-07 18:47:38,811 INFO 9540550 404 dt=0.551 dt_avg=0.536 type=text/html; charset=UTF-8 quota=None
# https://www.opensubtitles.org/en/subtitles/9540550/jewish-matchmaking-sv
# These subtitles were disabled, you should not use them (pipporan @ 06/05/2023 04:28:54)
# Subtitles was splitted to - 9540551 - 9540552 - 9540553 - 9540554 - 9540555 - 9540556 - 9540557 - 9540558
TODO store the 404 error message? example: "These subtitles were disabled ..."
"""
# TODO quiet: remove logging output " dt={dt_download:.3f} dt_avg={dt_download_avg:.3f}"
# FIXME missing subs from github action
# bug in nums_done?
# FIXME deadloop. stop scraper at http 429 = rate-limit exceeded
# https://github.com/milahu/opensubtitles-scraper-test/actions/runs/4906991416/jobs/8761814969
# expected time: 1E6 * 0.1 / 3600 = 28 hours
# no. i over-estimated the number of requests
# it was only 300K requests, and it was done in about 1.5 days
# not bad, zenrows.com! :)
# TODO re-fetch recent downloads
# give opensubtitles.org some time for moderation
# some time = some days = 3 days?
# NOTE zipfiles can change name over time
# example:
# a: 9524294.delete.me.new.beginnings.().eng.1cd.zip
# b: 9524294.delete.me.s02.e06.new.beginnings.().eng.1cd.zip
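# a minimal sketch for the rename note above, assuming zip files are stored
# as "<num>.<slug>.zip" in the new-subs directory; find_zipfiles_for_num is a
# hypothetical helper to look up all stored name variants of one subtitle ID
def find_zipfiles_for_num(num, subs_dir="new-subs"):
    """return all existing zip filenames for this subtitle number"""
    import glob
    import os
    pattern = os.path.join(subs_dir, f"{num}.*.zip")
    return sorted(os.path.basename(p) for p in glob.glob(pattern))
# e.g. find_zipfiles_for_num(9524294) could return both name variants a and b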
# python stdlib modules
import sys
import os
import re
import urllib.request
import logging
import time
import datetime
import random
import hashlib
import subprocess
import json
import glob
import collections
import zipfile
import base64
import asyncio
import argparse
import atexit
import traceback
import shlex
import shutil
import tempfile
import _io
import string
import itertools
import http.cookiejar
import math
import sqlite3
# pypi modules
import aiohttp
import requests
import magic # libmagic
import psutil
import nest_asyncio
import cryptography.hazmat.primitives.asymmetric.rsa
import cryptography.hazmat.primitives.serialization
import cryptography.x509
import cryptography.hazmat.primitives.hashes
# allow nesting multiple asyncio event loops
# fix: RuntimeError: This event loop is already running
nest_asyncio.apply()
sys.path.append("lib/thirdparty/aiohttp_chromium/src")
import aiohttp_chromium
print("imported aiohttp_chromium", aiohttp_chromium)
# TODO copy to aiohttp_chromium
from selenium_driverless.types.by import By
# selenium_webdriver.__package__ == "selenium_driverless"
from selenium_driverless.types.webelement import NoSuchElementException
from selenium_driverless.types.deserialize import StaleJSRemoteObjReference
# used by FlareSolverr
# https://github.com/FlareSolverr/FlareSolverr
# FlareSolverr/src/undetected_chromedriver/
# TODO why not use undetected_chromedriver directly
# FIXME undetected_chromedriver fails to bypass cloudflare. wtf? works in FlareSolverr
# TODO why exactly? what would FlareSolverr do?
# https://github.com/ultrafunkamsterdam/undetected-chromedriver
# NOTE my patched version of undetected_chromedriver
# also accepts these kwargs in chrome_args
# driver_executable_path
# driver_executable_is_patched
# browser_executable_path
# FIXME driver.get does not support the "timeout" kwarg
# await asyncify(driver.get("https://nowsecure.nl/#relax", timeout=20))
#import undetected_chromedriver as selenium_webdriver
# selenium_webdriver.__package__ == "undetected_chromedriver"
# TODO seleniumwire with socks5 proxy https://github.com/wkeeling/selenium-wire/issues/656
"""
# TODO go back to flaresolverr + requests/aiohttp
# no. seleniumwire is not working to bypass cloudflare
# no, this does not use undetected_chromedriver
#import seleniumwire.webdriver as selenium_webdriver
# make sure that undetected_chromedriver is installed
import undetected_chromedriver as _undetected_chromedriver
# https://github.com/wkeeling/selenium-wire#bot-detection
# FIXME selenium-wire's certificate (ca.crt) is not added to chromium
# https://github.com/wkeeling/selenium-wire/tree/master#certificates
# FIXME import of ca.crt fails:
# Certificate Import Error
# The Private Key for this Client Certificate is missing or invalid
# TODO create ca.pem file
# https://github.com/wkeeling/selenium-wire
import seleniumwire.undetected_chromedriver as undetected_chromedriver
"""
# TODO make seleniumwire work with latest mitmproxy
# ImportError: cannot import name 'connections' from 'mitmproxy'
#import seleniumwire.undetected_chromedriver as selenium_webdriver
# debug FlareSolverr
# https://github.com/FlareSolverr/FlareSolverr/discussions/806
# LOG_LEVEL=debug LOG_HTML=true HEADLESS=false
# flaresolverr module
# https://github.com/milahu/nur-packages/blob/master/pkgs/python3/pkgs/flaresolverr/flaresolverr.nix
#import flaresolverr.flaresolverr
# this would work like flaresolverr.flaresolverr.main()
# but we must set envs:
# CHROME_EXE_PATH
# PATCHED_DRIVER_PATH
# PATCHED_DRIVER_IS_PATCHED
# so instead, we use subprocess.Popen to run flaresolverr
# FIXME the default selenium api does not return the response status
# https://stackoverflow.com/questions/6509628/how-to-get-http-response-code-using-selenium-webdriver
# TODO? https://github.com/kaliiiiiiiiii/Selenium-Driverless#use-events
# local modules
import pyrfc6266
#import nssdb
from AiohttpMozillaCookieJar import AiohttpMozillaCookieJar
# https://www.zenrows.com/ # Startup plan
#max_concurrency = 25 # concurrency limit was reached
max_concurrency = 10
# unexpected response_status 403. content: b'{"code":"BLK0001","detail":"Your IP address has been blocked for exceeding the maximum error rate al'...
# -> change_ipaddr()
max_concurrency = 1 # debug
#max_concurrency = 2
# no. scraping nums in random order is required to bypass blocking
# when scraping nums in linear order, the scraper hangs after: ClientResponse.content: done
# ... probably hangs at (await response.content.read())
#fetch_nums_in_random_order = False
fetch_nums_in_random_order = True
# no. not needed because i over-estimated the missing number (?)
# the last batches should run in sequential order
# last batches = we are limited by API credits
# we dont want holes in the dataset
# done 300K = 30% of 1M
# start sequential at around 80% (better too early)
# so 80% would be
# first_num = 9180519
# options.last_num = 9520468
# 9180519 + 0.8 * 1E6 = 9980519
# 9520468 - 0.2 * 1E6 = 9320468 # pick
#sequential_fetching_after_num = 9320468
#options.sample_size = 10000 # randomize the last 4 digits
#options.sample_size = 1000 # randomize the last 3 digits
#options.sample_size = 100 # randomize the last 2 digits
#options.sample_size = 10 # randomize the last 1 digit
#if False:
#if True:
# debug
#max_concurrency = 1
#options.sample_size = 10
# captcha after 30 requests
#options.proxy_provider = "chromium"
# not working. blocked by cloudflare
#options.proxy_provider = "pyppeteer"
#fetcher_lib = "requests"
pyppeteer_headless = True
pyppeteer_headless = False # debug
# note: these API keys are all expired
#options.proxy_provider = "scrapfly.io"
try:
from fetch_subs_secrets import proxy_scrapfly_io_api_key
except ImportError:
proxy_scrapfly_io_api_key = None
proxy_scrapfly_io_cache_response = True
#options.proxy_provider = "scrapingdog.com"
api_key_scrapingdog_com = "643f9f3b575aa419c1d7218a"
#options.proxy_provider = "webscraping.ai"
api_key_webscraping_ai = "b948b414-dd1d-4d98-8688-67f154a74fe8"
webscraping_ai_option_proxy = "datacenter"
#webscraping_ai_option_proxy = "residential"
#options.proxy_provider = "zenrows.com"
fetcher_lib = "aiohttp"
try:
from fetch_subs_secrets import api_key_zenrows_com
except ImportError:
api_key_zenrows_com = "88d22df90b3a4c252b480dc8847872dac59db0e0" # expired
# opensubtitles.org
try:
from fetch_subs_secrets import opensubtitles_org_logins
except ImportError:
opensubtitles_org_logins = None
opensubtitles_org_login_cookies_txt_path = None
opensubtitles_org_login_cookie_jar = None
# opensubtitles.com
# https://opensubtitles.stoplight.io/docs/opensubtitles-api/e3750fd63a100-getting-started
# Your consumer can query the API on its own, and download 5 subtitles per IP per 24 hours,
# but a user must be authenticated to download more.
# Users will then be able to download as many subtitles as their rank allows,
# from 10 as a simple signed-up user, to 1000 for a VIP user.
# Download counters reset at midnight UTC time
try:
from fetch_subs_secrets import opensubtitles_com_logins
except ImportError:
opensubtitles_com_logins = None
opensubtitles_com_login_headers = {}
class Config:
zenrows_com_antibot = False
zenrows_com_js = False
config = Config()
#options.proxy_provider = "scraperbox.com"
proxy_scraperbox_com_api_key = "56B1354FD63EB435CA1A9096B706BD55"
#options.proxy_provider = "scrapingant.com"
api_key_scrapingant_com = "6ae0de59fad34337b2ee86814857278a"
new_subs_dir = "new-subs"
"""
new_subs_repo_dir = "new-subs-repo"
#new_subs_dir = "new-subs-temp-debug"
"""
new_subs_repo_shards_dir = "new-subs-repo-shards"
def datetime_str():
# https://stackoverflow.com/questions/2150739/iso-time-iso-8601-in-python#28147286
return datetime.datetime.utcnow().strftime("%Y%m%dT%H%M%S.%fZ")
global_remove_files_when_done = []
def make_x509_cert_pem_bytes() -> bytes:
# generate a self-signed x509 certificate in python
# https://cryptography.io/en/latest/x509/tutorial/#creating-a-self-signed-certificate
# rename imports
rsa = cryptography.hazmat.primitives.asymmetric.rsa
serialization = cryptography.hazmat.primitives.serialization
x509 = cryptography.x509
NameOID = cryptography.x509.oid.NameOID
hashes = cryptography.hazmat.primitives.hashes
# Generate our key
key = rsa.generate_private_key(
public_exponent=65537,
key_size=2048,
)
# Various details about who we are. For a self-signed certificate the
# subject and issuer are always the same.
subject = issuer = x509.Name([
x509.NameAttribute(NameOID.COUNTRY_NAME, "US"),
x509.NameAttribute(NameOID.STATE_OR_PROVINCE_NAME, "Some Province"),
x509.NameAttribute(NameOID.LOCALITY_NAME, "Some Locality"),
x509.NameAttribute(NameOID.ORGANIZATION_NAME, "Some Organization"),
x509.NameAttribute(NameOID.COMMON_NAME, "some-common-name.com"),
])
    # FIXME chromium doesn't like our cert (see the sketch after this function)
"""
Certification Authority Import Error
The file contained one certificate, which was not imported:
some-common-name.com: Not a Certification Authority
"""
cert = x509.CertificateBuilder().subject_name(
subject
).issuer_name(
issuer
).public_key(
key.public_key()
).serial_number(
x509.random_serial_number()
).not_valid_before(
datetime.datetime.now(datetime.timezone.utc)
).not_valid_after(
# Our certificate will be valid for about 100 years
datetime.datetime.now(datetime.timezone.utc) + datetime.timedelta(weeks=100*52)
).add_extension(
x509.SubjectAlternativeName([x509.DNSName("localhost")]),
critical=False,
# Sign our certificate with our private key
).sign(key, hashes.SHA256())
return cert.public_bytes(serialization.Encoding.PEM)
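# a hedged sketch for the FIXME above: the "Not a Certification Authority"
# import error is most likely caused by the missing BasicConstraints extension.
# marking the certificate as a CA (and allowing cert signing in KeyUsage)
# should let chromium import it as an authority. untested here;
# make_x509_ca_cert_pem_bytes is a hypothetical variant of the function above
def make_x509_ca_cert_pem_bytes() -> bytes:
    rsa = cryptography.hazmat.primitives.asymmetric.rsa
    serialization = cryptography.hazmat.primitives.serialization
    x509 = cryptography.x509
    NameOID = cryptography.x509.oid.NameOID
    hashes = cryptography.hazmat.primitives.hashes
    key = rsa.generate_private_key(public_exponent=65537, key_size=2048)
    subject = issuer = x509.Name([
        x509.NameAttribute(NameOID.COMMON_NAME, "some-common-name.com"),
    ])
    now = datetime.datetime.now(datetime.timezone.utc)
    cert = (
        x509.CertificateBuilder()
        .subject_name(subject)
        .issuer_name(issuer)
        .public_key(key.public_key())
        .serial_number(x509.random_serial_number())
        .not_valid_before(now)
        .not_valid_after(now + datetime.timedelta(weeks=100 * 52))
        # this is what chromium checks for "Certification Authority"
        .add_extension(x509.BasicConstraints(ca=True, path_length=None), critical=True)
        # allow signing other certificates and revocation lists with this CA
        .add_extension(
            x509.KeyUsage(
                digital_signature=True,
                content_commitment=False,
                key_encipherment=False,
                data_encipherment=False,
                key_agreement=False,
                key_cert_sign=True,
                crl_sign=True,
                encipher_only=False,
                decipher_only=False,
            ),
            critical=True,
        )
        .sign(key, hashes.SHA256())
    )
    return cert.public_bytes(serialization.Encoding.PEM)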
# https://stackoverflow.com/a/20372465/10440128
from inspect import currentframe
def __line__():
cf = currentframe()
return cf.f_back.f_lineno
# https://www.opensubtitles.org/en/search/subs
# https://www.opensubtitles.org/ # New subtitles
#options.last_num = 9520468 # 2023-04-25
#options.last_num = 9521948 # 2023-04-26
#options.last_num = 9523112 # 2023-04-27
#options.last_num = 9530994 # 2023-05-01
#options.last_num = 9531985 # 2023-05-01
#options.last_num = 9533109 # 2023-05-02
parser = argparse.ArgumentParser(
prog='fetch-subs',
description='Fetch subtitles',
#epilog='Text at the bottom of help',
)
default_jobs = 1 # see also: max_concurrency
default_num_downloads = 25
# with larger samples, produce more incomplete shards
# see also fetch_nums_in_random_order
default_sample_size = 1000
#default_sample_size = 200 # too low? blocked after some requests
"""
proxy_provider_values = [
#"pyppeteer",
"chromium",
"zenrows.com",
]
"""
default_proxy_provider = None
#parser.add_argument('filename')
parser.add_argument(
'--proxy-provider',
dest="proxy_provider", # options.proxy_provider
default=default_proxy_provider,
#choices=proxy_provider_values,
type=str,
metavar="S",
help=(
f"proxy provider. "
f"default: {default_proxy_provider}. "
#f"values: {', '.join(proxy_provider_values)}"
),
)
parser.add_argument(
'--start-vnc-client',
dest="start_vnc_client", # options.start_vnc_client
action='store_true',
help=(
f"start a local vnc client. "
f"useful for running the scraper on a local machine. "
),
)
parser.add_argument(
'--reverse-vnc-servers',
dest="vnc_client_list", # options.vnc_client_list
default=[],
type=str,
metavar="S",
nargs="*",
help=(
f"reverse vnc servers. "
f'only used with proxy provider "chromium". '
f"this will try to connect to one of the ssh servers, "
f"to create a TCP tunnel between the VNC server and vnc_port on the ssh server. "
f"The default vnc_port is 5901. "
f'alternative: pass a space-delimited list to the environment variable "REVERSE_VNC_SERVERS". '
f"format: [user@]host[:ssh_port[:vnc_port]]. "
f"example: --reverse-vnc-servers example.com [email protected]:22:1234"
),
)
parser.add_argument(
'--ssh-id-file',
dest="ssh_id_file_path", # options.ssh_id_file_path
default=None,
type=str,
metavar="S",
help=(
f"ssh id file path. "
f'used for "ssh -i path/to/ssh-id-file" to connect to a vnc client. '
f"example: ~/.ssh/id_rsa"
),
)
# see also: max_concurrency
parser.add_argument(
'--jobs',
default=default_jobs,
type=int,
metavar="N",
help=f"how many jobs to run in parallel. default: {default_jobs}",
)
parser.add_argument(
'--num-downloads',
dest="num_downloads",
default=default_num_downloads,
#type=int,
metavar="N",
help=(
f"limit the number of downloads. "
f"default: {default_num_downloads}. "
f"can be a range like 10-20, then value is random."
),
)
parser.add_argument(
'--sample-size',
dest="sample_size",
default=default_sample_size,
type=int,
metavar="N",
help=f"size of random sample. default: {default_sample_size}",
)
parser.add_argument(
'--first-num',
dest="first_num",
default=None,
type=int,
metavar="N",
help="first subtitle number. default: get from store",
)
parser.add_argument(
'--last-num',
dest="last_num",
default=None,
type=int,
metavar="N",
help="last subtitle number. default: get from remote",
)
parser.add_argument(
"--show-ip-address",
dest="show_ip_address",
default=False,
action="store_true",
help="show IP address. default: false. note: this is slow",
)
parser.add_argument(
"--username",
dest="username",
default=None,
type=str,
metavar="S",
help="username for login",
)
parser.add_argument(
"--debug",
default=False,
action="store_true",
help="show debug messages",
)
parser.add_argument(
"--force-download",
dest="force_download",
default=False,
action="store_true",
help="also download when files exist",
)
parser.add_argument(
"--tempdir",
dest="tempdir",
default=None,
type=str,
metavar="path",
help="path to tempdir",
)
parser.add_argument(
"--metadata-db",
dest="metadata_db",
default=None,
type=str,
metavar="path",
help="path to subtitles_all.db - parsed from subtitles_all.txt.gz",
)
parser.add_argument(
"--only-update-metadata-db",
dest="only_update_metadata_db",
action='store_true',
help="update subtitles_all.db and exit",
)
#options = parser.parse_args(sys.argv)
options = parser.parse_args()
# append servers from the env var, ignoring empty strings when the env var is unset
options.vnc_client_list += [s for s in re.split(r"\s+", os.environ.get("REVERSE_VNC_SERVERS", "")) if s]
logging_level = "INFO"
if options.debug:
# TODO disable debug log from selenium (too verbose)
logging_level = "DEBUG"
logging.basicConfig(
#format='%(asctime)s %(levelname)s %(message)s',
# also log the logger %(name)s, so we can filter by logger name
format='%(asctime)s %(name)s %(levelname)s %(message)s',
level=logging_level,
)
logger = logging.getLogger("fetch-subs")
def logger_print(*args):
logger.info(" ".join(map(str, args)))
if type(options.num_downloads) == str:
logger_print("options.num_downloads", repr(options.num_downloads))
if re.match(r"^\d+$", options.num_downloads):
options.num_downloads = int(options.num_downloads)
elif re.match(r"^(\d+)-(\d+)$", options.num_downloads):
m = re.match(r"^(\d+)-(\d+)$", options.num_downloads)
options.num_downloads = random.randint(int(m.group(1)), int(m.group(2)))
logging.info(f"options.num_downloads: {options.num_downloads}")
# global state
metadata_db_con = None
metadata_db_cur = None
async def update_metadata_db():
# FIXME subtitles_all.txt.gz-parse.py
#return # dont update
if not options.metadata_db:
return
max_age = 10*24*60*60 # 10 days
age = time.time() - os.path.getmtime(options.metadata_db)
if age <= max_age:
return
# https://stackoverflow.com/questions/538666/format-timedelta-to-string
def format_age(age):
age = datetime.timedelta(seconds=age)
return str(age)
logger_print(f"updating metadata db {repr(options.metadata_db)}. age {format_age(age)} > max_age {format_age(max_age)}")
# debug: use an existing .txt.gz file to avoid re-downloading
existing_txt_gz_path = None
#existing_txt_gz_path = "subtitles_all.txt.gz.20240714T173551Z"
if existing_txt_gz_path:
txt_gz_path = existing_txt_gz_path
logger_print(f"updating metadata db: using existing {txt_gz_path} - FIXME disable existing_txt_gz_path")
else:
# download the .txt.gz file
# file size 400 MByte @ 2024-07-15
# so this should fit into RAM
# but ideally, aiohttp_chromium_session.get should write directly to disk
url = "https://dl.opensubtitles.org/addons/export/subtitles_all.txt.gz"
txt_gz_path = f"subtitles_all.txt.gz.{datetime_str()}"
logger_print(f"updating metadata db: fetching {txt_gz_path} from {url}")
# FIXME?
aiohttp_chromium_session = await aiohttp_chromium.ClientSession(
#cookie_jar=cookie_jar,
#tempdir=tempdir,
_headless=True,
)
async def response_cleanup_chromium():
await response.__aexit__(None, None, None)
response_cleanup = response_cleanup_chromium
try:
response = await aiohttp_chromium_session.get(url)
logger_print(f"updating metadata db: mv {response._filepath} {txt_gz_path}")
#os.rename(response._filepath, txt_gz_path) # why not?
logger_print(f"response._filepath = {repr(response._filepath)}")
# wait until response is complete
# FIXME this can hang. TODO timeout + retry
# TODO open url to monitor download progress: chrome://downloads/
# TODO log download-progress every 30 seconds to debug log
logger_print("response._wait_complete ...")
t1 = time.time()
try:
# the download takes between 4 minutes and 4 hours
# -> timeout: 10 hours
await response._wait_complete(timeout=10*60*60)
except TimeoutError:
await response_cleanup()
raise Exception(f"{num} download failed")
t2 = time.time()
logger_print("response._wait_complete done after {(t2 - t1):.3f} seconds")
shutil.move(response._filepath, txt_gz_path)
#except asyncio.exceptions.TimeoutError as e:
except Exception as e:
logger_print(f"updating metadata db failed: {e}")
await aiohttp_chromium_session.close()
return
await aiohttp_chromium_session.close()
#### response_cleanup
# TODO keep only one old file, delete all older versions
keep_old_file = False
keep_old_file = True # debug
logger_print(f"updating metadata db: parsing tsv to sqlite ...")
# parser fails if db_path exists so use a tempfile
#db_path_temp = f"{txt_gz_path}.db.temp.{datetime_str()}"
db_path = f"{txt_gz_path}.db"
error_path = f"{db_path}.error"
debug_path = f"{db_path}.debug"
table_name = "subz_metadata"
if os.path.exists(db_path):
# parser fails if db_path exists
#logger_print(f"updating metadata db: error: output file exists: {db_path}")
#return
logger_print(f"updating metadata db: deleting old output file: {db_path}")
os.unlink(db_path)
logger_print(f"updating metadata db: writing {db_path}")
args = [
sys.executable, # python
"-u", # unbuffer output
"subtitles_all.txt.gz-parse.py",
db_path,
table_name,
txt_gz_path,
error_path,
debug_path,
]
logger_print(f"updating metadata db: running: {shlex.join(args)}")
proc = subprocess.run(args)
if proc.returncode != 0:
logger_print(f"updating metadata db: parsing tsv to sqlite failed")
return
logger_print(f"updating metadata db: parsing tsv to sqlite done")
if os.path.islink(options.metadata_db):
if not keep_old_file:
# note: this will not follow symlinks
link_target = os.readlink(options.metadata_db)
logger_print(f"updating metadata db: rm {link_target}")
os.unlink(link_target)
logger_print(f"updating metadata db: rm {options.metadata_db}")
os.unlink(options.metadata_db)
else:
if keep_old_file:
bak_path = options.metadata_db + f".bak.{datetime_str()}"
logger_print(f"updating metadata db: mv {options.metadata_db} {bak_path}")
os.rename(options.metadata_db, bak_path)
else:
logger_print(f"updating metadata db: rm {options.metadata_db}")
os.unlink(options.metadata_db)
logger_print(f"updating metadata db: ln -s {db_path} {options.metadata_db}")
os.symlink(db_path, options.metadata_db)
"""
await update_metadata_db()
if options.metadata_db:
logger_print(f"using metadata db {repr(options.metadata_db)}")
metadata_db_con = sqlite3.connect(options.metadata_db)
metadata_db_cur = metadata_db_con.cursor()
"""
# postprocess: fetch missing subs
# example: https://www.opensubtitles.org/en/subtitles/9205951
# this is a bug in opensubtitles.org
# the server returns infinite cyclic redirect via
# https://www.opensubtitles.org/en/msg-dmca
# and zenrows says: error: need javascript
# ... so these files were deleted because of dmca takedown requests (by copyright trolls)
missing_numbers = []
missing_numbers_txt_path = "missing_numbers.txt"
if os.path.exists(missing_numbers_txt_path):
logger_print(f"loading missing_numbers from {missing_numbers_txt_path}")
with open(missing_numbers_txt_path, "r") as f:
try:
nums = list(map(int, f.read().strip().split("\n")))
except ValueError:
# ValueError: invalid literal for int() with base 10: ''
nums = []
logger_print(f"loaded missing_numbers from {missing_numbers_txt_path}: {nums}")
missing_numbers += nums
logger.debug(f"{__line__()} missing_numbers = {missing_numbers}")
if missing_numbers:
# filter
# os.path.exists
# glob.glob
# os.listdir
logger_print(f"fetching {len(missing_numbers)} missing numbers:", missing_numbers)
# postprocess: create empty dmca files
# TODO detect these files while scraping
# in the future, zenrows may return a different error than
# RESP001 (Could not get content. try enabling javascript rendering)
# zenrows support:
# > The error might be misleading, but apart from changing that, we can't do anything else.
# > BTW, if they return a status code according to the error, you might get it back with original_status=true
#for num in missing_numbers:
# # create empty file
# filename_dcma = f"{new_subs_dir}/{num}.dmca"
# open(filename_dcma, 'a').close() # create empty file
#raise Exception("done postprocessing")
# sleep X seconds after each download
# to avoid http status "429 Too Many Requests"
#sleep_each_min, sleep_each_max = 0, 3
#sleep_each_min, sleep_each_max = 0, 20
#sleep_each_min, sleep_each_max = 0, 200
max_downloads_per_day = 1000 # vip account
#sleep_each_avg = (24 * 60 * 60) / max_downloads_per_day
new_session_sleep_min, new_session_sleep_max = 5*60, 10*60
# sleep X seconds after getting blocked for too many requests
blocked_sleep_min, blocked_sleep_max = 2.2*60*60, 2.6*60*60
# quota: 200 requests per day in chunks of 20 requests
# = 20 requests every 2.4 hours
# sleep X seconds after blocked by server
sleep_blocked = 24*60*60
# sleep X seconds after changing IP address
sleep_change_ipaddr = 10
sleep_change_ipaddr_min, sleep_change_ipaddr_max = 5*60, 15*60 # 5...15 minutes
sleep_change_ipaddr_min, sleep_change_ipaddr_max = 15*60, 45*60 # 15...45 minutes
#sleep_change_ipaddr_min, sleep_change_ipaddr_max = 5*60, 15*60 # 5...15 minutes
#sleep_change_ipaddr_min, sleep_change_ipaddr_max = 5*60, 45*60 # 5...45 minutes
#sleep_change_ipaddr_min, sleep_change_ipaddr_max = 3*60, 15*60 # 3...15 minutes
# sleep 5 minutes = 30*24*60/5 = 8640 downloads per day
# sleep 10 minutes = 30*24*60/10 = 4320 downloads per day
# sleep 15 minutes = 30*24*60/15 = 2880 downloads per day
# sleep 30 minutes = 30*24*60/30 = 1440 downloads per day
# sleep 45 minutes = 30*24*60/45 = 960 downloads per day
is_greedy = False
#is_greedy = True
if is_greedy:
sleep_each_min, sleep_each_max = 0, 0
sleep_change_ipaddr = 0
def random_numbers_with_sum(numbers_len, numbers_sum, numbers_dev=0.25, sum_error_max=0.005):
"""
generate a list of N random integers
where the sum of all integers is about S
plusminus a small error
"""
numbers_avg = numbers_sum / numbers_len
numbers_min = round((1 - numbers_dev) * numbers_avg)
numbers_max = round((1 + numbers_dev) * numbers_avg)
while True:
numbers = []
for i in range(numbers_len):
numbers.append(random.randint(numbers_min, numbers_max))
sum_error = abs(1 - (sum(numbers) / numbers_sum))
if sum_error < sum_error_max:
return numbers
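# example: random_numbers_with_sum(5, 100) could return [21, 19, 23, 17, 20],
# i.e. 5 integers within 25% of the average 20 whose sum is within 0.5% of 100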
sleep_each_times = None
def get_sleep_each_time():
# rate-limiting by cloudflare after 33...35 requests is not appeased by waiting
return 0
# no effect on rate-limiting
return random.randint(0, 5)
#return random.randint(0, 10) # debug
global sleep_each_times
if not sleep_each_times:
# 24 hours
#sleep_each_times = random_numbers_with_sum(max_downloads_per_day, (24 * 60 * 60))
# 6 hours: use less memory for sleep_each_times
sleep_each_times = random_numbers_with_sum(round(max_downloads_per_day / 4), (6 * 60 * 60))
return sleep_each_times.pop()
try:
from fetch_subs_secrets import fritzbox_login
except ImportError:
fritzbox_login = None
# too complex
# seems to require config on fritzbox to "permit access for applications"
#from change_ipaddr_fritzbox import change_ipaddr_fritzbox
# -> just use selenium
async def change_ipaddr():
raise ValueError("missing config for change_ipaddr. hint: fritzbox_login")
#return change_ipaddr_fritzbox()
def change_ipaddr_openwrt():
#dev = "wan"
# note: you must setup ssh public key authentication in openwrt
# https://openwrt.org/docs/guide-user/security/dropbear.public-key.auth
dev = "wan"
dev_ifconfig = "pppoe-wan"
def get_ipaddr():
# get current IP address
proc = subprocess.run(
            [
                "ssh",
                # FIXME the ssh destination host is missing here; hypothetical placeholder:
                "root@openwrt.lan",
                f"ifconfig {dev_ifconfig}",
            ],
check=True,
capture_output=True,
timeout=10,
encoding="utf8",
)
# inet addr:79.253.14.204 P-t-P:62.155.242.79 Mask:255.255.255.255
#logger_print("proc.stdout", repr(proc.stdout))
match = re.search(r"inet addr:(\d+\.\d+\.\d+\.\d+) ", proc.stdout)
#logger_print("match", repr(match))
ipaddr = match.group(1)