'''
GTFSserverfunctions.py
this file is meant to be inline-included in the main script. Seriously, I do not want to keep declaring import statements everywhere.
import tornado.web
import tornado.ioloop
import json
import os
import time, datetime
import xmltodict
import pandas as pd
from collections import OrderedDict
import zipfile, zlib
from tinydb import TinyDB, Query
from tinydb.operations import delete
import webbrowser
from Cryptodome.PublicKey import RSA #uses pycryptodomex package.. disambiguates from pycrypto, pycryptodome
import shutil # used in fareChartUpload to fix header if changed
import pathlib
from math import sin, cos, sqrt, atan2, radians # for lat-long distance calculations
# import requests # nope, not needed for now
from json.decoder import JSONDecodeError # used to catch corrupted DB file when tinyDB loads it.
import signal, sys # for catching Ctrl+C and exiting gracefully.
import gc # garbage collector, from https://stackoverflow.com/a/1316793/4355695
import csv
import numpy as np
import io # used in hyd csv import
# to do: how to get these variables declared in the other file to be recognized here?
global uploadFolder
global xmlFolder
global logFolder
global configFolder
global dbFolder
global exportFolder
global sequenceDBfile
global passwordFile
global chunkRulesFile
global configFile
if __name__ == "__main__":
print("Don't run this, run GTFSManager.py.")
'''
def csvwriter( array2write, filename, keys=None ):
# 15.4.18: Changing to use pandas instead of csv.DictWriter. Solves https://github.com/WRI-Cities/static-GTFS-manager/issues/3
df = pd.DataFrame(array2write)
df.to_csv(filename, index=False, columns=keys)
logmessage( 'Created', filename )
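# Usage sketch (illustrative values only; the path and columns below are made up): writes a list of
# dicts to CSV, with `keys` controlling which columns appear and in what order.
#   csvwriter([{'stop_id': 'S1', 'stop_name': 'Central'}], 'export/stops.txt', keys=['stop_id', 'stop_name'])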
def exportGTFS (folder):
# create commit folder
if not os.path.exists(folder):
os.makedirs(folder)
else:
returnmessage = 'Folder with same name already exists: ' + folder + '. Please choose a different commit name.'
return returnmessage
# let's zip them!
zf = zipfile.ZipFile(folder + 'gtfs.zip', mode='w')
# find .h5 files.. non-chunk ones first
filenames = findFiles(dbFolder, ext='.h5', chunk='n')
print(filenames)
for h5File in filenames:
start1 = time.time()
tablename = h5File[:-3] # remove last 3 chars, .h5
try:
df = pd.read_hdf(dbFolder + h5File).fillna('').astype(str)
except (KeyError, ValueError) as e:
df = pd.DataFrame()
logmessage('Note: {} does not have any data.'.format(h5File))
if len(df):
logmessage('Writing ' + tablename + ' to disk and zipping...')
df.to_csv(folder + tablename + '.txt', index=False, chunksize=1000000)
del df
zf.write(folder + tablename + '.txt' , arcname=tablename + '.txt', compress_type=zipfile.ZIP_DEFLATED )
else:
del df
logmessage(tablename + ' is empty so not exporting that.')
end1 = time.time()
logmessage('Added {} in {} seconds.'.format(tablename,round(end1-start1,3)))
gc.collect()
# Now, process chunk files.
for tablename in list(chunkRules.keys()):
start1 = time.time()
filenames = findFiles(dbFolder, ext='.h5', prefix=tablename)
if not len(filenames): continue #skip if no files
print('Processing chunks for {}: {}'.format(tablename,list(filenames)) )
# first, getting all columns
columnsList = set()
for count,h5File in enumerate(filenames):
try:
df = pd.read_hdf(dbFolder + h5File,stop=0)
except (KeyError, ValueError) as e:
df = pd.DataFrame()
logmessage('Note: {} does not have any data.'.format(h5File))
columnsList.update(df.columns.tolist())
del df
gc.collect()
columnsList = list(columnsList)
# moving the main ID to first position
# from https://stackoverflow.com/a/1014544/4355695
IDcol = chunkRules[tablename]['key']
columnsList.insert(0, columnsList.pop(columnsList.index(IDcol)))
logmessage('Columns for {}: {}'.format(tablename,list(columnsList)))
for count,h5File in enumerate(filenames):
logmessage('Writing {} to csv'.format(h5File))
try:
df1 = pd.read_hdf(dbFolder + h5File).fillna('').astype(str)
except (KeyError, ValueError) as e:
df1 = pd.DataFrame()
logmessage('Note: {} does not have any data.'.format(h5File))
# in case the final columns list has more columns than df1 does, concatenating an empty df with the full columns list.
# from https://stackoverflow.com/a/30926717/4355695
columnsetter = pd.DataFrame(columns=columnsList)
df2 = pd.concat([df1,columnsetter], ignore_index=True, copy=False, sort=False)[columnsList]
# adding [columnsList] so the ordering of columns is strictly maintained between all chunks
appendFlag,headerFlag = ('w',True) if count == 0 else ('a',False)
# so at first loop, we'll create a new one and include column headers.
# In subsequent loops we'll append and not repeat the column headers.
df2.to_csv(folder + tablename + '.txt', mode=appendFlag, index=False, header=headerFlag, chunksize=10000)
del df2
del df1
gc.collect()
mid1 = time.time()
logmessage('CSV {} created in {} seconds, now zipping'.format(tablename + '.txt',round(mid1-start1,3)))
zf.write(folder + tablename +'.txt' , arcname=tablename +'.txt', compress_type=zipfile.ZIP_DEFLATED )
end1 = time.time()
logmessage('Added {} to zip in {} seconds.'.format(tablename,round(end1-mid1,3)))
logmessage('Added {} in {} seconds.'.format(tablename,round(end1-start1,3)))
zf.close()
gc.collect()
logmessage('Generated GTFS feed at {}'.format(folder))
returnmessage = '<p>Success! Generated GTFS feed at <a href="' + folder + 'gtfs.zip' + '">' + folder + 'gtfs.zip</a>. Click to download.</p><p>You can validate the feed on <a href="https://gtfsfeedvalidator.transitscreen.com/" target="_blank">GTFS Feed Validator</a> website.</p>'
return returnmessage
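# The chunked-table export above streams each chunk to one CSV with to_csv: the first chunk is
# written in 'w' mode with a header, later chunks are appended without one. A minimal standalone
# sketch of that pattern (the `chunks` list and output path are illustrative, not used here):
#   for count, chunk in enumerate(chunks):
#       mode, header = ('w', True) if count == 0 else ('a', False)
#       chunk.to_csv('stop_times.txt', mode=mode, index=False, header=header)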
def importGTFS(zipname):
start1 = time.time()
# take backup first
if not debugMode:
backupDB() # do when in production, skip when developing / debugging
# unzip imported zip
# make a separate folder to unzip in, so that when importing we don't end up picking other .txt files that happen to be in the general uploads folder.
unzipFolder = uploadFolder + '{:unzip-%H%M%S}/'.format(datetime.datetime.now())
if not os.path.exists(unzipFolder):
os.makedirs(unzipFolder)
fileToUnzip = uploadFolder + zipname
logmessage('Extracting uploaded zip to {}'.format(unzipFolder))
# UNZIP a zip file, from https://stackoverflow.com/a/36662770/4355695
with zipfile.ZipFile( fileToUnzip,"r" ) as zf:
zf.extractall(unzipFolder)
# loading names of the unzipped files
# scan for txt files, non-recursively, only at folder level. from https://stackoverflow.com/a/22208063/4355695
filenames = [f for f in os.listdir(unzipFolder) if f.lower().endswith('.txt') and os.path.isfile(os.path.join(unzipFolder, f))]
logmessage('Extracted files: ' + str(list(filenames)) )
if not len(filenames):
return False
# Check if essential files are there or not.
# ref: https://developers.google.com/transit/gtfs/reference/#feed-files
# using set and subset. From https://stackoverflow.com/a/16579133/4355695
# hey we need to be ok with incomplete datasets, the tool's purpose is to complete them!
if not set(requiredFeeds).issubset(filenames):
logmessage('Note: We are importing a GTFS feed that does not contain all the required files as per GTFS spec: %s \
Kindly ensure the necessary files get created before exporting.' % str(list(requiredFeeds)))
# purge the DB. We're doing this only AFTER the ZIPfile is successfully uploaded and unzipped and tested.
purgeDB()
logmessage('Commencing conversion of gtfs feed files into the DB\'s .h5 files')
for txtfile in filenames:
tablename = txtfile[:-4]
# using Pandas to read the csv and write it as .h5 file
if not chunkRules.get(tablename,None):
# normal files that don't need chunking
df = pd.read_csv(unzipFolder + txtfile ,dtype=str, na_values='')
# na_filter=False to read blank cells as empty strings instead of NaN. from https://stackoverflow.com/a/45194703/4355695
# reading ALL columns as string, and taking all NA values as blank string
if not len(df):
# skip the table if it's empty.
print('Skipping',tablename,"because it's empty")
continue
h5File = tablename.lower() + '.h5'
logmessage('{}: {} rows'.format(h5File, str(len(df)) ) )
df.to_hdf(dbFolder+h5File, 'df', format='table', mode='w', complevel=1)
# if there is no chunking rule for this table, then make one .h5 file with the full table.
del df
gc.collect()
else:
# let the chunking commence
logmessage('Storing {} in chunks.'.format(tablename))
chunkSize = chunkRules[tablename].get('chunkSize',200000)
IDcol = chunkRules[tablename].get('key')
fileCounter = 0
lookupJSON = OrderedDict()
carryOverChunk = pd.DataFrame()
for chunk in pd.read_csv(unzipFolder + txtfile, chunksize=chunkSize, dtype=str, na_values=''):
# see if can use na_filter=False to speed up
if not len(chunk):
# skip the table if it's empty.
# there's probably going to be only one chunk if this is true
print('Skipping',tablename,"because it's empty")
continue
# zap the NaNs at chunk level
chunk = chunk.fillna('')
IDList = chunk[IDcol].unique().tolist()
# print('first ID: ' + IDList[0])
# print('last ID: ' + IDList[-1])
workChunk = chunk[ chunk[IDcol].isin(IDList[:-1]) ]
if len(carryOverChunk):
workChunk = pd.concat([carryOverChunk, workChunk],ignore_index=True, sort=False)
carryOverChunk = chunk[ chunk[IDcol] == IDList[-1] ]
fileCounter += 1
h5File = tablename + '_' + str(fileCounter) + '.h5' # ex: stop_times_1.h5
logmessage('{}: {} rows'.format(h5File, str(len(workChunk)) ) )
workChunk.to_hdf(dbFolder+h5File, 'df', format='table', mode='w', complevel=1)
del workChunk
gc.collect()
# making lookup table
for x in IDList[:-1]:
if lookupJSON.get(x,None):
logmessage('WARNING: {} may not have been sorted properly. Encountered a repeat instance of {}={}'
.format(txtfile,IDcol,x))
lookupJSON[x] = h5File
# chunk loop over.
del chunk
# Now append the last carry-over chunk in to the last chunkfile
logmessage('Appending the {} rows of last ID to last chunk {}'
.format(str(len(carryOverChunk)),h5File))
carryOverChunk.to_hdf(dbFolder+h5File, 'df', format='table', append=True, mode='a', complevel=1)
# need to set append=True to tell it to append. mode='a' is only for file-level.
# add last ID to lookup
lookupJSON[ IDList[-1] ] = h5File
del carryOverChunk
gc.collect()
lookupJSONFile = chunkRules[tablename].get('lookup','lookup.json')
with open(dbFolder + lookupJSONFile, 'w') as outfile:
json.dump(lookupJSON, outfile, indent=2)
# storing lookup json
logmessage('Lookup json: {} created for mapping ID {} to {}_n.h5 chunk files.'.format(lookupJSONFile,IDcol,tablename))
logmessage('Finished importing GTFS feed. You can remove the feed zip {} and folder {} from {} if you want.'.format(zipname,unzipFolder,uploadFolder))
return True
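# The carry-over logic above keeps every row of a given ID inside a single .h5 chunk file: each CSV
# chunk writes out all IDs except its last one, and the last ID's rows are prepended to the next
# chunk. A minimal standalone sketch of the idea (variable names here are illustrative):
#   carryOver = pd.DataFrame()
#   for chunk in pd.read_csv('stop_times.txt', chunksize=200000, dtype=str, na_values=''):
#       ids = chunk['trip_id'].unique().tolist()
#       work = pd.concat([carryOver, chunk[chunk['trip_id'].isin(ids[:-1])]], ignore_index=True)
#       carryOver = chunk[chunk['trip_id'] == ids[-1]]
#       # write `work` to its own chunk file and map each of ids[:-1] -> that file in a lookup dict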
def GTFSstats():
'''
Gives current stats of the GTFS tables held in DB
Enlists:
- agency name(s).
- mandatory GTFS tables
- optional GTFS tables
- extra tables present in feed but not part of traditional GTFS spec (only mentioned if present)
- List number of entries in each
- Pad to have tabular like view
- Format numbers to have thousands separators
- If there are excess agencies, mention only first two and then put number of remaining
'''
content = '';
agencyDF = readTableDB('agency')
if len(agencyDF):
agencyList = agencyDF.agency_name.tolist()
if len(agencyList)>2 : agencyList[:] = agencyList[:2] + ['and {} more'.format(len(agencyList)-2 )]
# if there are excess agencies, mention only first two and then put number of remaining
content += 'Agency: {}<br>'.format( ', '.join(agencyList) )
else:
content += 'Agency: none found.<br>'
filenames = findFiles(dbFolder, ext='.h5', prefix=None, chunk='all')
coveredFiles = []
# first, run through the main GTFS files in proper order
content += '<br>1. Main tables: (*)<br>'
for feed in requiredFeeds:
tablename = feed[:-4] # remove .txt
count = 0
if tablename not in chunkRules.keys():
# normal tables
if os.path.exists(dbFolder+tablename+'.h5'):
hdf = pd.HDFStore(dbFolder + tablename + '.h5')
try:
count = hdf.get_storer('df').nrows
# gets number of rows, without reading the entire file into memory. From https://stackoverflow.com/a/26466301/4355695
except (KeyError, ValueError) as e:
logmessage('Note: {} does not have any data.'.format(tablename + '.h5'))
hdf.close()
# have to close this opened file, else will conflict with pd.read_csv later on
coveredFiles.append(tablename+'.h5')
message = '{}: {:,} entries'.format( tablename.ljust(20),count )
# {:,} : does number formattting. from https://stackoverflow.com/q/16670125/4355695
# .ljust(20): pads spaces to string so that total len=20. from https://stackoverflow.com/a/5676676/4355695
logmessage(message)
content += message + '<br>'
else:
# chunked files
chunks = findFiles(dbFolder, ext='.h5', prefix=tablename, chunk='y')
if chunks:
for h5File in chunks:
hdf = pd.HDFStore(dbFolder + h5File)
try:
count += hdf.get_storer('df').nrows
except (KeyError, ValueError) as e:
logmessage('Note: {} does not have any data.'.format(h5File))
hdf.close()
coveredFiles.append(h5File)
message = '{}: {:,} entries'.format( tablename.ljust(20),count )
logmessage(message)
content += message + '<br>'
# requiredFeeds loop over
# next, cover optional tables in GTFS spec
content += '<br>2. Additional tables: (#)<br>'
for feed in optionalFeeds:
tablename = feed[:-4] # remove .txt
count = 0
if tablename not in chunkRules.keys():
# normal tables
if os.path.exists(dbFolder+tablename+'.h5'):
hdf = pd.HDFStore(dbFolder + tablename + '.h5')
try:
count = hdf.get_storer('df').nrows
except (KeyError, ValueError) as e:
logmessage('Note: {} does not have any data.'.format(tablename + '.h5'))
hdf.close()
coveredFiles.append(tablename+'.h5')
message = '{}: {:,} entries'.format( tablename.ljust(20),count )
logmessage(message)
content += message + '<br>'
else:
# chunked files
chunks = findFiles(dbFolder, ext='.h5', prefix=tablename, chunk='y')
if chunks:
for h5File in chunks:
hdf = pd.HDFStore(dbFolder + h5File)
try:
count += hdf.get_storer('df').nrows
except (KeyError, ValueError) as e:
logmessage('Note: {} does not have any data.'.format(h5File))
hdf.close()
coveredFiles.append(h5File)
message = '{}: {:,} entries'.format( tablename.ljust(20),count )
logmessage(message)
content += message + '<br>'
# optionalFeeds loop over
# now we cover the files that are present in the feed but not part of the GTFS spec
remainingFiles = set(filenames) - set(coveredFiles)
if(remainingFiles) : content += '<br>3. Other tables: (^)<br>'
for h5File in remainingFiles:
hdf = pd.HDFStore(dbFolder + h5File)
try:
count = hdf.get_storer('df').nrows
except (KeyError, ValueError) as e:
logmessage('Note: {} does not have any data.'.format(h5File))
count = 0
hdf.close()
message = '{}: {:,} entries'.format( h5File[:-3].ljust(20),count )
logmessage(message)
content += message + '<br>'
# Footnotes
content += '<br>----<br>*: required part of GTFS spec, needed to make valid GTFS'
content += '<br>#: part of GTFS spec but not compulsory'
if(remainingFiles) : content += '<br>^: not part of traditional GTFS spec, used by operator for additional purposes'
return content
# end of GTFSstats function
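# The counts above come from HDFStore metadata rather than loading whole tables into memory.
# Standalone example of the same call (the file name is hypothetical):
#   store = pd.HDFStore('db/agency.h5')
#   try:
#       n = store.get_storer('df').nrows
#   except (KeyError, ValueError):
#       n = 0   # empty or never-written store
#   store.close()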
def readTableDB(tablename, key=None, value=None):
'''
main function for reading a table or part of it from the DB
read-only
note: this does not handle non-primary keys for chunked tables. - let's change that!
'''
# if tablename is a blank string, return empty array.
if not len(tablename):
return pd.DataFrame()
if tablename not in chunkRules.keys():
# not a chunked file
h5Files = [tablename + '.h5']
else:
# if it's a chunked file
if key == chunkRules[tablename].get('key'):
h5File = findChunk(value, tablename)
if not h5File:
logmessage('readTableDB: No {} chunk found for key={}'.format(tablename,value))
h5Files = []
else: h5Files = [h5File]
else:
h5Files = findFiles(dbFolder, ext='.h5', prefix=tablename, chunk='y')
# so now we have array/list h5Files having one or more .h5 files to be read.
collectDF = pd.DataFrame()
for h5File in h5Files:
# check if file exists.
if not os.path.exists(dbFolder+h5File):
continue
try:
df = pd.read_hdf(dbFolder + h5File).fillna('').astype(str)
# typecasting as str, keeping NA values blank ''
except (KeyError, ValueError) as e:
df = pd.DataFrame()
logmessage('Note: {} does not have any data.'.format(h5File))
if(key and value):
logmessage('readTableDB: table:{}, column:{}, value:"{}"'.format(tablename,key,value))
# check if this column is present or not
if key not in df.columns:
logmessage('readTableDB: Error: column {} not found in {}. Skipping it.'.format(key,h5File) )
continue
df.query('{} == "{}"'.format(key,value), inplace=True)
# note: in case the column (key) has a space, see https://github.com/pandas-dev/pandas/issues/6508. Let's avoid spaces in column headers please!
# dilemma: what if the value is a number instead of a string? let's see if that ever happens.
# -> solved by typecasting everything as str by default
collectDF = collectDF.append(df.copy(), ignore_index=True, sort=False)
del df
logmessage('readTableDB: Loaded {}, {} records'.format(tablename,len(collectDF)) )
return collectDF
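# Usage sketch (table and key/value are illustrative; the chunked case only works when `key` is
# that table's chunking key, e.g. trip_id for stop_times):
#   routesDF = readTableDB('routes')                                   # whole table
#   oneTripDF = readTableDB('stop_times', key='trip_id', value='T1')   # chunk-aware filtered read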
def replaceTableDB(tablename, data, key=None, value=None):
# new Data
xdf = pd.DataFrame(data).fillna('').astype(str)
# type-casting everything as string only, it's safer. See https://github.com/WRI-Cities/static-GTFS-manager/issues/82
if value is not None:
value = str(value)
# fork out if it's stop_times or other chunked table
if tablename in chunkRules.keys():
# we do NOT want to come here from the replaceID() function. That should be handled separately.
# Here, do only if it's coming from the actual data editing side.
if value is None or key != chunkRules[tablename]['key']:
# NOPE, not happening! for chunked table, value HAS to be a valid id.
logmessage('Invalid key-value pair for chunked table',tablename,':',key,'=',value)
del xdf
gc.collect()
return False
chunkyStatus = replaceChunkyTableDB(xdf, value, tablename)
del xdf
gc.collect()
return chunkyStatus
# fork over, now back to regular
h5File = tablename + '.h5'
# if file doesn't exist (ie, brand new data), make a new .h5 with the data and scram
if not os.path.exists(dbFolder+h5File):
xdf.to_hdf(dbFolder+h5File, 'df', format='table', mode='w', complevel=1)
logmessage('DB file for {} not found so created with the new data.'.format(tablename))
# else proceed if file exists
elif ((key is not None) and (value is not None) ):
# remove entries matching the key and value
try:
df = pd.read_hdf(dbFolder+h5File).fillna('').astype(str)
oldLen = len( df[ df[key] == str(value)])
df.query(key + ' != "' + str(value) + '"', inplace=True)
except (KeyError, ValueError) as e:
df = pd.DataFrame()
logmessage('Note: {} does not have any data.'.format(h5File))
oldLen = 0
df3 = pd.concat([df,xdf], ignore_index=True, sort=False)
df3.to_hdf(dbFolder+h5File, 'df', format='table', mode='w', complevel=1)
logmessage('Replaced {} entries for {}={} with {} new entries in {}.'\
.format(oldLen,key,str(value),str(len(xdf)),tablename ) )
del df3
del df
else:
# directly replace whatever's there with new data.
xdf.to_hdf(dbFolder+h5File, 'df', format='table', mode='w', complevel=1)
logmessage('Replaced {} with new data, {} entries inserted.'.format(tablename,str(len(data)) ) )
del xdf
gc.collect()
return True
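# Usage sketch (illustrative data): for plain tables the whole table, or just one key's rows, can be
# replaced; for chunked tables like stop_times, `key` must be the chunking key or the call refuses.
#   replaceTableDB('agency', [{'agency_id': 'A1', 'agency_name': 'Example Transit'}])
#   replaceTableDB('stop_times', newRows, key='trip_id', value='T1')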
def sequenceSaveDB(sequenceDBfile, route_id, data, shapes=None):
'''
save onward and return stops sequence for a route
'''
dataToUpsert = {'route_id': route_id, '0': data[0], '1': data[1] }
if shapes:
if len(shapes[0]):
dataToUpsert.update({ 'shape0':shapes[0] })
if len(shapes[1]):
dataToUpsert.update({ 'shape1':shapes[1] })
# add shapes names to sequence DB only if they are valid shape names, not if they are blank strings.
# solves part of https://github.com/WRI-Cities/static-GTFS-manager/issues/38
db = tinyDBopen(sequenceDBfile)
Item = Query()
status = True
try:
db.upsert( dataToUpsert, Item['route_id'] == route_id )
except:
status = False
db.close()
return status
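# Usage sketch (illustrative stop and shape ids): stores the onward (index 0) and return (index 1)
# stop sequences for one route, with optional shape ids, keyed by route_id in the TinyDB file.
#   sequenceSaveDB(sequenceDBfile, 'R1', [['S1', 'S2', 'S3'], ['S3', 'S2', 'S1']], shapes=['R1_0', 'R1_1'])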
def sequenceReadDB(sequenceDBfile, route_id):
db = tinyDBopen(sequenceDBfile)
Item = Query()
'''
check = db.contains(Item['route_id'] == route_id)
if not check:
db.close()
return False
'''
sequenceItem = db.search(Item['route_id'] == route_id)
db.close()
if sequenceItem == []:
return False
sequenceArray = [ sequenceItem[0]['0'], sequenceItem[0]['1'] ]
logmessage('Got the sequence from sequence db file.')
return sequenceArray
def sequenceFull(sequenceDBfile, route_id):
# 20.4.18 : writing this to pass on shapes data too. in future, change things on JS end and merge the sequenceReadDB function with this.
db = tinyDBopen(sequenceDBfile)
Item = Query()
sequenceItem = db.search(Item['route_id'] == route_id)
db.close()
if sequenceItem == []:
return False
sequenceArray = sequenceItem[0]
logmessage('Got the sequence from sequence db file.')
return sequenceArray
def extractSequencefromGTFS(route_id):
# idea: scan for the first trip matching a route_id, in each direction, and get its sequence from stop_times.
# In case it hasn't been provisioned yet in stop_times, will return empty arrays.
tripsdf = readTableDB('trips', key='route_id', value=route_id)
if not len(tripsdf):
logmessage('extractSequencefromGTFS: no trips found for {}. Skipping.'.format(route_id))
return [ [], [] ]
if 'direction_id' not in tripsdf.columns:
logmessage('extractSequencefromGTFS: Trips table doesn\'t have a direction_id column. Well, it\'s optional. Taking the first trip only for route {}.'.format(route_id))
oneTrip0 = tripsdf.iloc[0].trip_id
oneTrip1 = None
else:
dir0df = tripsdf[ tripsdf.direction_id == '0'].copy().reset_index(drop=True).trip_id
oneTrip0 = dir0df.iloc[0] if len(dir0df) else tripsdf.iloc[0].trip_id
# using first trip's id as default, for cases where direction_id is blank.
dir1df = tripsdf[ tripsdf.direction_id == '1'].copy().reset_index(drop=True).trip_id
oneTrip1 = dir1df.iloc[0] if len(dir1df) else None
# reset_index: re-indexes as 0,1,... from https://stackoverflow.com/a/20491748/4355695
del dir0df
del dir1df
del tripsdf
if oneTrip0:
array0 = readColumnDB('stop_times','stop_id', key='trip_id', value=oneTrip0)
logmessage('extractSequencefromGTFS: Loading sequence for route {}, onward direction from trip {}:\n{}'.format(route_id,oneTrip0,str(list(array0[:50])) ))
else:
array0 = []
logmessage('No onward sequence found for route {}'.format(route_id))
if oneTrip1:
array1 = readColumnDB('stop_times','stop_id', key='trip_id', value=oneTrip1)
logmessage('extractSequencefromGTFS: Loading sequence for route {}, return direction from trip {}:\n{}'.format(route_id,oneTrip1,str(list(array1[:50])) ))
else:
array1 = []
logmessage('No return sequence found for route {}'.format(route_id))
sequence = [array0, array1]
return sequence
def uploadaFile(fileholder):
# adapted from https://techoverflow.net/2015/06/09/upload-multiple-files-to-the-tornado-webserver/
# receiving a form file object as argument.
# saving to uploadFolder. In case same name file already exists, over-writing.
filename = fileholder['filename'].replace("/", "")
# zapping folder redirections if any
logmessage('Saving filename: ' + filename + ' to ' + uploadFolder)
if not os.path.exists(uploadFolder):
os.makedirs(uploadFolder)
with open(uploadFolder+filename, "wb") as out:
# Be aware, that the user may have uploaded something evil like an executable script ...
# so it is a good idea to check the file content (xfile['body']) before saving the file
out.write(fileholder['body'])
return filename
###########################
def diagnoseXMLs(weekdayXML, sundayXML, depot=None) :
try:
weekdayReport = '<p>Weekday XML: <a target="_blank" href="' + uploadFolder + weekdayXML + '">' + weekdayXML + '↗</a></p>'
sundayReport = '<p>Sunday XML: <a target="_blank" href="' + uploadFolder + sundayXML + '">' + sundayXML + '↗</a></p>'
weekdaySchedules = []
sundaySchedules = []
fullStopsList = set()
# depot trip checking:
dropDepotTrips = 0
if depot:
depotsList = depot.split(',')
else:
depotsList = []
logmessage('Depot stations: ' + str(depotsList) )
# logic: if first stop or last stop is in depotsList, then increment dropDepotTrips counter.
# 1. before processing XMLs, lets get the mapped stops list from the resident stations.csv
mappedStopsList = readStationsCSV(xmlFolder + 'stations.csv')
# 2. Loading Weekday XML file.
with open( uploadFolder + weekdayXML , encoding='utf8' ) as fd:
fileholder = xmltodict.parse(fd.read(), attr_prefix='')
# trips_from_xml = fileholder['ROOT']['SCHEDULE']['TRIPS']['TRIP']
scheduleHolder = fileholder['ROOT']['SCHEDULE']
# whether the node is single or repeating in the XML, convert it so that it becomes a list to iterate through
if type(scheduleHolder) == type(OrderedDict()) :
scheduleHolder = [scheduleHolder]
# this makes a single schedule compatible with multiple schedule entries in xml
logmessage(str(len(scheduleHolder)) + ' route(s) found in ' + weekdayXML)
for schedule in scheduleHolder:
schedule_name = schedule['NAME']
stopsList = set()
directions = set()
vehicles = set()
timesList = set()
for trip in schedule['TRIPS']['TRIP']:
timesList.add(trip['ENTRY_TIME'])
directions.add(trip['DIRECTION'])
vehicles.add(trip['SERVICE_ID'])
# check if first or last stop is in depotsList
if (trip['STOP'][0]['TOP'] in depotsList) or ( trip['STOP'][-1]['TOP'] in depotsList ):
dropDepotTrips += 1
for stop in trip['STOP']:
stopsList.add(stop['TOP'])
fullStopsList.update(stopsList)
# sorting: https://www.tutorialspoint.com/python/list_sort.htm
sortedTimesList = list(timesList)
sortedTimesList.sort()
weekdayReport += '<p><b>Schedule: ' + schedule_name + '</b>'
weekdayReport += '<br>Trips: ' + str( len( schedule['TRIPS']['TRIP'] ))
weekdayReport += '<br>Vehicles: ' + str( len( vehicles ))
weekdayReport += '<br>Directions: ' + str( len( directions ))
weekdayReport += '<br>First trip: ' + sortedTimesList[0]
weekdayReport += '<br>Last trip: ' + sortedTimesList[-1] + '</p>'
weekdaySchedules.append(schedule_name)
################
# 3. Loading Sunday XML file.
with open( uploadFolder + sundayXML , encoding='utf8' ) as fd:
fileholder = xmltodict.parse(fd.read(), attr_prefix='')
# trips_from_xml = fileholder['ROOT']['SCHEDULE']['TRIPS']['TRIP']
scheduleHolder = fileholder['ROOT']['SCHEDULE']
# whether the node is single or repeating in the XML, convert it so that it becomes a list to iterate through
if type(scheduleHolder) == type(OrderedDict()) :
scheduleHolder = [scheduleHolder]
# this makes a single schedule compatible with multiple schedule entries in xml
logmessage(str(len(scheduleHolder)) + ' route(s) found in ' + sundayXML)
for schedule in scheduleHolder:
schedule_name = schedule['NAME']
stopsList = set()
directions = set()
vehicles = set()
timesList = set()
for trip in schedule['TRIPS']['TRIP']:
timesList.add(trip['ENTRY_TIME'])
directions.add(trip['DIRECTION'])
vehicles.add(trip['SERVICE_ID'])
# check if first or last stop is in depotsList
if (trip['STOP'][0]['TOP'] in depotsList) or ( trip['STOP'][-1]['TOP'] in depotsList ):
dropDepotTrips += 1
for stop in trip['STOP']:
stopsList.add(stop['TOP'])
fullStopsList.update(stopsList)
sortedTimesList = list(timesList)
sortedTimesList.sort()
sundayReport += '<p><b>Schedule: ' + schedule_name + '</b>'
sundayReport += '<br>Trips: ' + str( len( schedule['TRIPS']['TRIP'] ))
sundayReport += '<br>Vehicles: ' + str( len( vehicles ))
sundayReport += '<br>Directions: ' + str( len( directions ))
sundayReport += '<br>First trip: ' + sortedTimesList[0]
sundayReport += '<br>Last trip: ' + sortedTimesList[-1] + '</p>'
sundaySchedules.append(schedule_name)
############
# 4. Calculate missing stops and write verbose.
check = len(fullStopsList - mappedStopsList)
if not check:
missingStopsReport = '<p><font color="green"><b><font size="6">✔</font> All internal stops are mapped!</b></font><br>We are good to proceed to step 3.</p>';
stationsStatus = missingStopsReport
allStopsMappedFlag = True
else :
missingListing = ''
for item in (fullStopsList - mappedStopsList):
missingListing += '<li>' + item + '</li>'
missingStopsReport = '<p><font color="red"><b><font size="5">✘</font> ' + str(check) + ' stop(s) are missing</b></font> from the stations mapping list.<br>Proceed to step 2, and ensure that the following internal station names are present under either <b><i>up_id</i></b> or <b><i>down_id</i></b> columns, with the corresponding columns filled properly:<br><b><ul>' + missingListing + '</ul></b></p>'
stationsStatus = '<p><font color="red"><b><font size="5">✘</font> ' + str(check) + ' stop(s) are missing</b></font> from the stations mapping list.<br>Ensure that the following internal station names are present under either <b><i>up_id</i></b> or <b><i>down_id</i></b> columns, with the corresponding columns filled in properly:<br><b><ul>' + missingListing + '</ul></b></p>'
allStopsMappedFlag = False
#########
# 5. putting the report together in HTML
diagBox = '<small><div class="row"><div class="col">' + weekdayReport + '</div><div class="col">' + sundayReport + '</div></div></small>' + '<hr>' + missingStopsReport
#########
# 6. Appending
dropDepotTripsText = '<div class="alert alert-warning">Note: Total <u><b>' + str(dropDepotTrips) + '</b> trips will be dropped</u> from the XMLs as they are depot trips, ie, they originate from or terminate at the depot station (chosen in step 2).</div>'
diagBox += dropDepotTripsText
stationsStatus += dropDepotTripsText
#########
# 6. Return a dict
return { 'report':diagBox, 'stationsStatus':stationsStatus, 'weekdaySchedules':weekdaySchedules,
'sundaySchedules':sundaySchedules, 'allStopsMappedFlag':allStopsMappedFlag }
except:
return False
##############################
def readStationsCSV(csvfile = xmlFolder + 'stations.csv'):
'''
This is for KMRL Metro file import
'''
stations = pd.read_csv(csvfile)
# load up_id and down_id columns, but removing blank/null values. From https://stackoverflow.com/a/22553757/4355695
upList = stations[stations['up_id'].notnull()]['up_id']
downList = stations[stations['down_id'].notnull()]['down_id']
mappedStopsList = set() # non-repeating list. Silently drops any repeating values getting added.
mappedStopsList.update( upList )
mappedStopsList.update( downList )
return mappedStopsList
##############################
def decrypt(password):
# from https://stackoverflow.com/questions/2490334/simple-way-to-encode-a-string-according-to-a-password
if len(password) == 0:
logmessage("Why u no entering password! Top right! Top right!")
return False
with open(passwordFile, "rb") as f:
encoded_key = f.read()
try:
key = RSA.import_key(encoded_key, passphrase=password)
return True
except ValueError:
return False
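# The password check above simply tries to import the stored RSA private key using the supplied
# passphrase: a wrong passphrase raises ValueError, which is reported as False. Illustrative use
# (the variable name is hypothetical):
#   if decrypt(userSuppliedPassword):
#       # proceed with the protected operation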
##############################
def csvunpivot(filename, keepcols, var_header, value_header, sortby):
# brought in from xml2GTFS functions.py
fares_pivoted = pd.read_csv(filename, encoding='utf8')
logmessage( 'Loading and unpivoting',filename)
fares_unpivoted = pd.melt(fares_pivoted, id_vars=keepcols, var_name=var_header, value_name=value_header).sort_values(by=sortby)
# rename header 'Stations' to 'origin_id', from https://stackoverflow.com/questions/11346283/renaming-columns-in-pandas/
# and drop all rows having NaN values. from https://stackoverflow.com/a/13434501/4355695
fares_unpivoted_clean = fares_unpivoted.rename(columns={'Stations': 'origin_id'}).dropna()
# 4.9.18: returns a dataframe now
return fares_unpivoted_clean
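# Unpivot sketch: a fare chart with one row per origin ('Stations') and one column per destination
# becomes long-form (origin_id, destination, fare) rows via pd.melt. Illustrative call (arguments
# are assumed for the example, not taken from the actual fare-chart workflow):
#   fares = csvunpivot('uploads/fares.csv', ['Stations'], 'destination_id', 'price', ['Stations'])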
##############################
def get_sec(time_str):
h, m, s = time_str.split(':')
return int(h) * 3600 + int(m) * 60 + int(s)
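# Example: get_sec('08:30:15') -> 8*3600 + 30*60 + 15 = 30615 seconds past midnight.
# GTFS times past midnight like '25:10:00' also work since this is plain arithmetic, but the
# string must have all three HH:MM:SS parts or the unpacking raises ValueError.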
def lat_long_dist(lat1,lon1,lat2,lon2):
# function for calculating ground distance between two lat-long locations
R = 6373.0 # approximate radius of earth in km.
lat1 = radians( float(lat1) )
lon1 = radians( float(lon1) )
lat2 = radians( float(lat2) )
lon2 = radians( float(lon2) )
dlon = lon2 - lon1
dlat = lat2 - lat1
a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
c = 2 * atan2(sqrt(a), sqrt(1 - a))
distance = float(format( R * c , '.2f' )) #rounding. From https://stackoverflow.com/a/28142318/4355695
return distance
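# Worked example of the haversine formula above: lat_long_dist(12.9716, 77.5946, 13.0827, 80.2707)
# returns roughly 290 (km), the great-circle distance between Bengaluru and Chennai, rounded to
# two decimal places.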
def intcheck(s):
s = s.strip()
return int(s) if s else ''
def tinyDBopen(filename):
# made for the event when db file is corrupted. using this instead of default db open statement will reset the file if corrupted.
try:
db = TinyDB(filename, sort_keys=True, indent=2)
except JSONDecodeError:
logmessage('tinyDBopen: DB file {} has invalid json. Making a backup copy and creating a new blank one.'.format(filename))
shutil.copy(filename, filename + '_backup') # copy file. from http://www.techbeamers.com/python-copy-file/
open(filename, 'w').close() # make a blank file in place of the corrupted one. from https://stackoverflow.com/a/12654798/4355695
db = TinyDB(filename, sort_keys=True, indent=2)
except FileNotFoundError:
logmessage('tinyDBopen: {} not found so creating.'.format(filename))
open(filename, 'w').close() # make a blank file. from https://stackoverflow.com/a/12654798/4355695
db = TinyDB(filename, sort_keys=True, indent=2)
return db
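# Usage sketch (illustrative file name): open a TinyDB file that may be missing or corrupted,
# without crashing the server:
#   db = tinyDBopen(dbFolder + 'sequence.json')
#   records = db.all()
#   db.close()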
def geoJson2shape(route_id, shapefile, shapefileRev=None):
with open(shapefile, encoding='utf8') as f:
# loading geojson, from https://gis.stackexchange.com/a/73771/44746
data = json.load(f)
logmessage('Loaded',shapefile)
output_array = []
try:
coordinates = data['features'][0]['geometry']['coordinates']
except:
logmessage('Invalid geojson file ' + shapefile)
return False
prevlat = coordinates[0][1]
prevlon = coordinates[0][0]
dist_traveled = 0
i = 0
for item in coordinates:
newrow = OrderedDict()
newrow['shape_id'] = route_id + '_0'
newrow['shape_pt_lat'] = item[1]
newrow['shape_pt_lon'] = item[0]
calcdist = lat_long_dist(prevlat,prevlon,item[1],item[0])
dist_traveled = dist_traveled + calcdist
newrow['shape_dist_traveled'] = dist_traveled
i = i + 1
newrow['shape_pt_sequence'] = i
output_array.append(newrow.copy())
prevlat = item[1]
prevlon = item[0]
# Reverse trip now.. either same shapefile in reverse or a different shapefile
if( shapefileRev ):
with open(shapefileRev, encoding='utf8') as g:
data2 = json.load(g)
logmessage('Loaded',shapefileRev)
try:
coordinates = data2['features'][0]['geometry']['coordinates']
except:
logmessage('Invalid geojson file ' + shapefileRev)
return False
else:
coordinates.reverse()
prevlat = coordinates[0][1]
prevlon = coordinates[0][0]
dist_traveled = 0
i = 0
for item in coordinates:
newrow = OrderedDict()
newrow['shape_id'] = route_id + '_1'
newrow['shape_pt_lat'] = item[1]
newrow['shape_pt_lon'] = item[0]
calcdist = lat_long_dist(prevlat,prevlon,item[1],item[0])
dist_traveled = float(format( dist_traveled + calcdist , '.2f' ))
newrow['shape_dist_traveled'] = dist_traveled
i = i + 1
newrow['shape_pt_sequence'] = i
output_array.append(newrow.copy())
prevlat = item[1]
prevlon = item[0]
return output_array
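# Usage sketch (illustrative paths): build shapes.txt rows for both directions of a route from one
# GeoJSON per direction; when only one alignment exists, the same file is reused and reversed for
# the return direction.
#   rows = geoJson2shape('R1', uploadFolder + 'R1_up.geojson', uploadFolder + 'R1_down.geojson')
#   csvwriter(rows, exportFolder + 'shapes.txt')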
def allShapesListFunc():
shapeIDsJson = {}
shapeIDsJson['all'] = readColumnDB('shapes','shape_id')
db = tinyDBopen(sequenceDBfile)
allSequences = db.all()
db.close()
shapeIDsJson['saved'] = { x['route_id']:[ x.get('shape0', ''), x.get('shape1','') ] for x in allSequences }
return shapeIDsJson
def serviceIdsFunc():
calendarDF = readTableDB('calendar')
collectorSet = set()
if len(calendarDF):
collectorSet.update( calendarDF['service_id'].tolist() )
# service_id_list = calendarDF['service_id'].tolist()
calendarDatesDF = readTableDB('calendar_dates')
if len(calendarDatesDF):
collectorSet.update( calendarDatesDF['service_id'].tolist() )
return list(collectorSet)
#################################################3
def replaceIDfunc(key,valueFrom,valueTo):