-
Notifications
You must be signed in to change notification settings - Fork 4
/
parser.js
100 lines (84 loc) · 1.9 KB
/
parser.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
var async = require('async');
var csv = require('csv');
var fs = require('fs');
// Run-wide author counters. parseCsv increments them for every post it
// inspects and readCsv prints the object once all files are processed.
//   total   - posts inspected
//   unique  - authors seen for the first time
//   deleted - posts whose author field was empty/falsy
var stats = {
  authors: { total: 0, unique: 0, deleted: 0 }
};
/**
 * Build one record object by pairing each header name with the value found
 * at the same position in a data row.
 *
 * @param {Array} header - Column names; each becomes a key of the result.
 * @param {Array} post - Row values, read by the same index as `header`.
 * @returns {Object} Object with one property per header entry. Positions
 *   missing from `post` yield `undefined`; a repeated header name keeps the
 *   value from its last occurrence.
 */
function createJson (header, post) {
  var record = {};
  var columnCount = header.length;
  var column = 0;
  while (column < columnCount) {
    record[header[column]] = post[column];
    column += 1;
  }
  return record;
}
/**
 * Convert parsed CSV rows into an array of objects keyed by the header row.
 *
 * The first row is taken as the header; every following row is turned into
 * one object via createJson(header, row).
 *
 * @param {Array<Array>} posts - Parsed rows, header row first. Not modified.
 * @returns {Array<Object>} One object per data row, in input order; an empty
 *   array when `posts` holds no data rows.
 */
function transformCsvToJson (posts) {
  var header = posts[0];
  // slice, not splice: splice would strip the data rows out of the caller's
  // array as a side effect.
  var rows = posts.slice(1);
  var jsonPosts = [];
  for (var i = 0, len = rows.length; i < len; i++) {
    // Push the record directly so no undeclared variable is assigned, which
    // would create a global (or throw in strict mode / ES modules).
    jsonPosts.push(createJson(header, rows[i]));
  }
  return jsonPosts;
}
/**
 * Collect the authors of one CSV file's posts, record the ones not seen
 * before, and append them to the output stream.
 *
 * Side effects: appends first-seen authors to `authors`, updates the
 * module-level `stats.authors` counters, logs a summary line, and writes one
 * comma-separated line to `config.output` (a bare newline when the file
 * contributed no new authors).
 *
 * @param {Array<Array>} csvPosts - Parsed CSV rows, header row first.
 * @param {Array} authors - Accumulator of every author seen so far; mutated.
 * @returns {undefined}
 */
function parseCsv (csvPosts, authors) {
  var records = transformCsvToJson(csvPosts);
  var newcomers = [];

  records.forEach(function (record) {
    stats.authors.total += 1;

    var name = record.author;
    if (!name) {
      // An empty author field is counted as a deleted account.
      stats.authors.deleted += 1;
      return;
    }
    if (authors.indexOf(name) !== -1) {
      // Already known from this or an earlier file.
      return;
    }

    authors.push(name);
    newcomers.push(name);
    stats.authors.unique += 1;
  });

  console.log('\n> WRITE : len=' + newcomers.length + ' new authors.');
  config.output.write(newcomers.join(', ') + '\n');
}
/**
 * Parse every file in a data directory as CSV, one file at a time, feeding
 * each file's rows to parseCsv; when all files are done (or one fails),
 * close the output stream and print the results.
 *
 * @param {string} data_dir - Directory name, resolved relative to this
 *   script's own directory (__dirname).
 * @param {Array} authors - Accumulator handed to parseCsv for every file;
 *   its final length is printed at the end.
 * @returns {undefined}
 */
function readCsv (data_dir, authors) {
  // Resolve the directory once so that listing and reading use the same
  // location regardless of the process's current working directory.
  var dir = __dirname + '/' + data_dir;
  var filenames = fs.readdirSync(dir);
  var total = filenames.length;
  var counter = 0;

  async.eachSeries(filenames, function (filename, cb) {
    // Make sure the series callback fires exactly once per file, whether the
    // parser finishes or fails.
    var finished = false;
    function done (err) {
      if (finished) {
        return;
      }
      finished = true;
      cb(err || null);
    }

    var parser = csv();
    // NOTE(review): assumes the csv parser reports read/parse failures via
    // an 'error' event, as its README examples do. Forwarding it stops the
    // series and reaches the final callback instead of going unhandled.
    parser.on('error', done);
    parser
      .from.path(dir + '/' + filename, {
        delimiter: ',',
        escape: '"'
      })
      .to.array(function (posts) {
        parseCsv(posts, authors);
        counter += 1;
        if (counter % 10 === 0) {
          // Progress against the real number of files found.
          console.log(counter + '/' + total);
        }
        done(null);
      });
  }, function (e) {
    config.output.end();
    if (e) {
      // Log the original error so its message and stack are preserved.
      return console.error(e);
    }
    console.log('\n> RESULTS :');
    console.log(authors.length);
    console.log(stats);
  });
}
// Config
// data_dir: name of the directory holding the CSV files; passed to readCsv.
// output:   write stream for './output/authors.txt'. parseCsv appends one
//           line per processed file and readCsv ends the stream when the
//           run finishes. The stream is opened as soon as this statement
//           runs.
var config = {
data_dir: 'data',
output: fs.createWriteStream('./output/authors.txt')
};
// Shared accumulator of every distinct author seen so far; parseCsv pushes
// first-seen authors onto it.
var authors = [];
// Start the run: process every file found in the data directory.
readCsv(config.data_dir, authors);