-
Notifications
You must be signed in to change notification settings - Fork 30
/
wiki-dl.sh
executable file
·34 lines (31 loc) · 1.27 KB
/
wiki-dl.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
#!/bin/bash
#
# Copyright (c) 2017-present, All rights reserved.
# Written by Julien Tissier <[email protected]>
#
# This file is part of Dict2vec.
#
# Dict2vec is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Dict2vec is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License at the root of this repository for
# more details.
#
# You should have received a copy of the GNU General Public License
# along with Dict2vec. If not, see <http://www.gnu.org/licenses/>.
# Download the English Wikipedia dump, pipe it through wiki-parser.pl, and
# write two smaller files made of the first bytes of the parsed output.
#
# Requires: wget, bzip2, perl, and wiki-parser.pl in the current directory.
# Outputs:  $DATA_DIR/enwiki-full  - full output of wiki-parser.pl
#           $DATA_DIR/enwiki-50M   - first 295976756 bytes of enwiki-full
#           $DATA_DIR/enwiki-200M  - first 1164930898 bytes of enwiki-full

# Abort on the first failing command, on unset variables, and when any stage
# of a pipeline fails. Without pipefail a failed download or decompression
# would be hidden by perl's exit status and still be reported as "Done.".
set -euo pipefail

DATA_DIR=./data
URL=https://dumps.wikimedia.org/enwiki/20210101/enwiki-20210101-pages-articles-multistream.xml.bz2

# The output redirections below fail if the target directory is missing.
mkdir -p -- "$DATA_DIR"

echo "Downloading English Wikipedia dump of January 2021 ..."
# Stream download -> decompress -> parse so the compressed dump is never
# stored on disk; `time` reports the duration of the whole pipeline.
time wget -qO- "$URL" | bzip2 -d | perl wiki-parser.pl > "$DATA_DIR/enwiki-full"
echo "Done."
echo

echo "Creating enwiki-50M and enwiki-200M..."
# Both subsets are byte-prefixes of the full file.
head -c 295976756 "$DATA_DIR/enwiki-full" > "$DATA_DIR/enwiki-50M"
head -c 1164930898 "$DATA_DIR/enwiki-full" > "$DATA_DIR/enwiki-200M"
echo "Done."
echo