-
Notifications
You must be signed in to change notification settings - Fork 1
/
main.py
154 lines (104 loc) · 5.11 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
import datetime
import re
import argparse
from urllib.parse import urljoin
import pandas as pd
import requests
from bs4 import BeautifulSoup
import simplejson as json
# Parse the command-line arguments: the JSON output path and an integer
# death count (default 0).
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument("--output", "-o", default="./data/data.json")
arg_parser.add_argument("--deaths", "-d", default=0, type=int)
args = arg_parser.parse_args()

# Timestamp of this run in Japan Standard Time (UTC+9); it is stored as the
# "lastUpdate" value of the output document.
JST = datetime.timezone(datetime.timedelta(hours=9), "JST")
dt_now = datetime.datetime.now(tz=JST)
dt_update = f"{dt_now:%Y/%m/%d %H:%M}"

data = {"lastUpdate": dt_update}
# Data wrangling: download the Tochigi prefecture page that links to the
# published Excel workbooks.
url = "http://www.pref.tochigi.lg.jp/e04/welfare/hoken-eisei/kansen/hp/coronakensahasseijyoukyou.html"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
}

# requests applies no timeout by default, so an unresponsive server would
# hang the script forever; bound the (connect, read) waits in seconds.
r = requests.get(url, headers=headers, timeout=(10, 60))
# Abort on an HTTP error status instead of parsing an error page.
r.raise_for_status()

soup = BeautifulSoup(r.content, "html.parser")
# Number of COVID-19 tests performed
# inspections_summary
# `string=` is the current name of Beautiful Soup's former `text=` keyword.
tag_kensa = soup.find("a", string=re.compile("^新型コロナウイルス感染症検査件数.+エクセル"))
if tag_kensa is None:
    # Fail with a clear message instead of an AttributeError on None below.
    raise RuntimeError("link to the inspection-count Excel file was not found on the page")

link_kensa = urljoin(url, tag_kensa.get("href"))

# Two header rows are read, so the columns form a MultiIndex; flatten it to
# plain tuples and give the date column a simple string name.
df_kensa = pd.read_excel(link_kensa, header=[1, 2])
df_kensa.columns = df_kensa.columns.to_flat_index()
df_kensa.rename(columns={("検査日", "Unnamed: 0_level_1"): "検査日"}, inplace=True)

# True for cells whose text is all digits, i.e. Excel serial day numbers.
flg_is_serial = df_kensa["検査日"].astype('str').str.isdigit()

# Handle a date column that mixes Excel serial numbers with date strings.
if flg_is_serial.any():
    fromSerial = pd.to_datetime(
        df_kensa.loc[flg_is_serial, "検査日"].astype(float),
        unit="D",
        origin=pd.Timestamp("1899/12/30"),
    )
    fromString = pd.to_datetime(df_kensa.loc[~flg_is_serial, "検査日"])
    # Both parts keep their original row labels, so the assignment realigns
    # the concatenated values to the right rows.
    df_kensa["検査日"] = pd.concat([fromString, fromSerial])

df_kensa.set_index("検査日", inplace=True)
df_kensa = df_kensa.astype("Int64").fillna(0)
df_kensa.sort_index(inplace=True)
df_kensa["日付"] = df_kensa.index.strftime("%Y-%m-%d")

# Add the outsourced ("委託分") counts to the prefecture and city columns.
df_kensa[("検査件数", "栃木県")] += df_kensa[("検査件数", "県委託分")]
df_kensa[("検査件数", "宇都宮市")] += df_kensa[("検査件数", "市委託分")]

df_insp_sum = df_kensa.loc[:, ["日付", ("検査件数", "栃木県"), ("検査件数", "宇都宮市")]]

data["inspections_summary"] = {
    "data": df_insp_sum.values.tolist(),
    "date": dt_update,
}
# List of COVID-19 cases reported in Tochigi prefecture
# `string=` is the current name of Beautiful Soup's former `text=` keyword.
tag_kanja = soup.find("a", string=re.compile("^栃木県における新型コロナウイルス感染症の発生状況一覧.+エクセル"))
if tag_kanja is None:
    # Fail with a clear message instead of an AttributeError on None below.
    raise RuntimeError("link to the patient-list Excel file was not found on the page")

link_kanja = urljoin(url, tag_kanja.get("href"))

df_kanja = pd.read_excel(link_kanja, header=1, skipfooter=1)
df_kanja.rename(columns={"退院・退所日": "退院日", "備考(No.は症例番号)": "備考"}, inplace=True)

# Exclude rows whose remarks column contains "削除" (deleted).
df_kanja["備考"] = df_kanja["備考"].fillna("").astype(str).str.strip()
# .copy() makes the filtered frame independent, so the in-place edits below
# do not act on a slice of the original (avoids SettingWithCopyWarning).
df_kanja = df_kanja[~df_kanja["備考"].str.contains("削除")].copy()

# Rows without a case number are dropped before the integer cast.
df_kanja.dropna(subset=["番号"], inplace=True)
df_kanja["番号"] = df_kanja["番号"].astype(int)

# Dates arrive as Excel serial day numbers; convert whole columns at once
# rather than calling pd.to_datetime once per row.
excel_origin = pd.Timestamp("1899/12/30")
df_kanja["陽性確認日"] = pd.to_datetime(
    pd.to_numeric(df_kanja["陽性確認日"]), unit="D", origin=excel_origin
)
# Non-numeric discharge cells are coerced to NaN and therefore become NaT.
df_kanja["退院日"] = pd.to_datetime(
    pd.to_numeric(df_kanja["退院日"], errors="coerce"), unit="D", origin=excel_origin
)

df_kanja["退院"] = df_kanja["退院日"].dt.strftime("%Y-%m-%d")

# Status is "入院中" while no discharge date exists and "退院" once one does.
df_kanja["状態"] = "入院中"
df_kanja["状態"] = df_kanja["状態"].where(df_kanja["退院日"].isnull(), "退院")
## main_summary
# Count patients per status; reindex guarantees that all three keys exist
# (with 0) even when a status never occurs in the data.
sr_situ = df_kanja["状態"].value_counts()
sr_situ = sr_situ.reindex(["入院中", "退院", "死亡"], fill_value=0)

data["main_summary"] = {
    "attr": "検査実施人数",
    # Last row of the cumulative total. .iloc[-1] yields a scalar; calling
    # int() on a one-element Series (tail(1)) is deprecated in pandas.
    "value": int(df_kensa[("累積検査件数", "合計")].iloc[-1]),
    "children": [
        {
            "attr": "陽性患者数",
            "value": len(df_kanja),
            "children": [
                {"attr": "入院中", "value": int(sr_situ["入院中"])},
                # The --deaths argument moves that many cases from the
                # discharged count to the deceased count.
                {"attr": "退院", "value": int(sr_situ["退院"]) - args.deaths},
                {"attr": "死亡", "value": int(sr_situ["死亡"]) + args.deaths},
            ],
        }
    ],
}
## patients
# The positive-confirmation date is published as each case's release date.
df_kanja["リリース日"] = df_kanja["陽性確認日"].dt.strftime("%Y-%m-%d")

# Columns exported per patient, in output order.
patient_columns = ["番号", "リリース日", "居住地", "年代", "性別", "退院"]
df_patients = df_kanja[patient_columns]

data["patients"] = dict(
    data=df_patients.to_dict(orient="records"),
    date=dt_update,
)
## patients_summary
# Number of confirmed cases per confirmation date, in chronological order.
ser_patients_sum = df_kanja["陽性確認日"].value_counts().sort_index()

# When the inspection data reaches a later date than the last confirmed
# case, add that date with a count of 0 so the series extends up to it.
last_inspection_day = df_kensa.index[-1]
if ser_patients_sum.index[-1] < last_inspection_day:
    ser_patients_sum[last_inspection_day] = 0

ser_patients_sum.sort_index(inplace=True)

# Resample to one row per calendar day; days without cases count as 0.
daily_counts = ser_patients_sum.asfreq("D", fill_value=0)
df_patients_sum = pd.DataFrame({"小計": daily_counts})
df_patients_sum["日付"] = df_patients_sum.index.strftime("%Y-%m-%d")

data["patients_summary"] = dict(
    data=df_patients_sum.loc[:, ["日付", "小計"]].values.tolist(),
    date=dt_update,
)
# Write the collected sections to the output path as indented UTF-8 JSON,
# keeping non-ASCII text unescaped (ensure_ascii=False).
with open(args.output, mode="w", encoding="utf-8") as out_file:
    json.dump(data, out_file, indent=4, ensure_ascii=False, ignore_nan=True)