Skip to content

Commit

Permalink
remove mandatory md5 check for custom datasets
Browse files: browse the repository at this point in the history
  • Loading branch information
kennymckormick committed Mar 20, 2024
1 parent 4b12db4 commit 2674902
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 15 deletions.
27 changes: 13 additions & 14 deletions vlmeval/utils/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,17 +10,18 @@ def isliststr(s):


def check_md5(data_path, dataset):
try:
with open(data_path, 'rb') as f:
hash = hashlib.new('md5')
for chunk in iter(lambda: f.read(2**20), b''):
hash.update(chunk)
if str(hash.hexdigest()) == dataset_md5_dict[dataset]:
return True
else:
warnings.warn('this data file is incomplete, so it needs to be downloaded again.')
return False
except:
if dataset not in dataset_md5_dict:
warnings.warn(f'We do not have an md5 record for dataset {dataset}, skip the md5 check. ')
return True
assert osp.exists(data_path)
with open(data_path, 'rb') as f:
hash = hashlib.new('md5')
for chunk in iter(lambda: f.read(2**20), b''):
hash.update(chunk)
if str(hash.hexdigest()) == dataset_md5_dict[dataset]:
return True
else:
warnings.warn('this data file is incomplete, so it needs to be downloaded again.')
return False


Expand Down Expand Up @@ -54,9 +55,7 @@ def __init__(self, dataset='MMBench', skip_noimg=True):
file_name = url.split('/')[-1]
data_path = osp.join(self.data_root, file_name)

if osp.exists(data_path) and (
md5(data_path) == dataset_md5_dict[dataset] if dataset in dataset_md5_dict else True
):
if osp.exists(data_path) and check_md5(data_path, dataset):
pass
else:
warnings.warn('The dataset tsv is not downloaded')
Expand Down
2 changes: 1 addition & 1 deletion vlmeval/utils/dataset_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@
'OCRBench': 'OCRBench',
})

assert set(dataset_URLs) == set(img_root_map) == set(dataset_md5_dict)
assert set(dataset_URLs) == set(img_root_map)


def DATASET_TYPE(dataset):
Expand Down

0 comments on commit 2674902

Please sign in to comment.