-
Notifications
You must be signed in to change notification settings - Fork 21
/
conductor.py
75 lines (65 loc) · 3.09 KB
/
conductor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
this script conducts the entire flow of the multivac system to date. it has the
following flow:
1. collect data
a. these data come from arxiv, springer, and pubmed in this instance, but
could be modified to include more
b. it saves the downloaded pdf's to a directory and creates a json object
for further use
2. parse data
a. the json objects that are saved from the collection step are processed
for dependencies, input (word position), and morphology (lemma) [dim]
b. it also identifies and notates equations throughout articles
3. run glove models
a. take article collection that is parsed and create glove word embeddings
b. develops both domain-general and domain-specific models
4. build the query generation (qg) network
a. uses context/answers as inputs to create questions as output
b. builds off of the domain-adapted glove models to produces robust
questions around a topic of interest (in this case, epidemiology)
5. build markov logic network (mln)
a. compile parsed dim files into trees and semantically cluster
b. produce a graphical model based on first-order logic for
"""
import argparse
from multivac.src.data.glove import glove_main
from multivac.src.data.make import collect_main
from multivac.src.data.parsing import nlp_parse_main
from multivac.src.data.qgnet import qgnet_main
from multivac.pymln.pymln import mln_main
def conduct(args_dict):
# step 1: collect data
collect_main()
# step 2:
nlp_parse_main(args_dict)
# step 3: run glove models
glove_main()
# step 4: build qg network
qgnet_main(args_dict)
# step 5: build mln
mln_main(args_dict)
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Orchestrate pipeline for '
'MULTIVAC processing and modeling.')
parser.add_argument('-bp', '--nlp_bp', required=False, type=int,
help='Which document to start parsing with.')
parser.add_argument('-js', '--nlp_newjson', action='store_true',
help='Boolean; indicates whether to create new JSON '
'file for glove embedding.')
parser.add_argument('-an', '--subset', type=int, help='Number of articles '
'for MLN run.')
parser.add_argument('-pc', '--prior_num_conj', default=10, type=int,
help='Prior on number of conjunctive parts assigned to '
'same cluster in MLN.')
parser.add_argument('-pp', '--prior_num_param', default=5, type=int,
help='Prior on number of parameters for cluster '
'merges.')
parser.add_argument('-qp', '--qgnet_path', required=True, help='The '
'top-level qgnet directory to create folders for '
'models and data.')
parser.add_argument('-v', "--verbose", action='store_true', help='Give '
'verbose output during MLN modeling.')
args_dict = vars(parser.parse_args())
conduct(args_dict)