code-for-hamamatsu · euledge · Aug 7, 2022
diff --git a/.github/workflows/create_data_news_json.yml b/.github/workflows/create_data_news_json.yml
@@ -19,6 +19,7 @@ jobs:
       continue-on-error: true
       id: error_on_news_json
       run: |
+        pip install -r production/tool/requirements.txt
         python3 -B production/tool/create_news_json.py > news.json
         mv news.json production/data/hamamatsu/news.json
     # - name: RUN PYTHON SCRIPT(create data.json)

diff --git a/tool/create_news_json.py b/tool/create_news_json.py
@@ -7,6 +7,7 @@
 from datetime import timezone
 from datetime import timedelta
 from html.parser import HTMLParser
+from bs4 import BeautifulSoup
 
 '''
 浜松市コロナサイトのお知らせ部分からnews.jsonを作成する
@@ -34,83 +35,21 @@
 }
 '''
 
-class NewsParser(HTMLParser):
-    def __init__(self):
-        HTMLParser.__init__(self)
-        self.BASE_URL = 'https://www.city.hamamatsu.shizuoka.jp'
-        self.inContents = False
-        self.inDay = False
-        self.ulInDay = False
-        self.listInDay = False
-        self.link = False
-        self.news = []
-        self.currentDate = ''
-        self.supplement = ''
-        self.starttag = ''
-        self.endtag = ''
-
-    def handle_starttag(self, tag, attrs):
-        attrs = dict(attrs)
-        self.starttag = tag
-        # <div class="box_info_cnt">
-        if tag == "div" and "class" in attrs and attrs['class'] == "box_info_cnt":
-            self.inContents = True
-            return
-        # <li>x月y日
-        if tag == "li" and self.inContents and not self.inDay:
-            self.inDay = True
-            return
-        # <li>x月y日<ul>
-        if tag == "ul" and self.inDay:
-            self.ulInDay = True
-            return
-        # <li>x月y日<ul><li>
-        if tag == "li" and self.ulInDay:
-            self.listInDay = True
-            return
-        # <li>x月y日<ul><li><a href="xxxx.html">yyyyyyyy</a>
-        if tag == "a" and self.listInDay:
-            self.link = True
-            if attrs["href"].startswith("http"):
-                self.news.append({"date": self.currentDate,"url": attrs["href"]})
-            else:
-                self.news.append({"date": self.currentDate,"url": self.BASE_URL + attrs["href"]})
-            return
-
+def main():
+    BASE_URL = 'https://www.city.hamamatsu.shizuoka.jp'
 
-    def handle_endtag(self, tag):
-        self.endtag = tag
-        if tag == "a" and self.link:
-            self.link = False
-            return
-        if tag == "li" and self.listInDay:
-            self.listInDay = False
-            return
-        if tag == "ul" and self.ulInDay:
-            self.ulInDay = False
-            return
-        if tag == "li" and self.inDay:
-            self.inDay = False
-            return
-        if tag == "div" and self.inContents:
-            self.inContents = False
-            return
+    response = requests.get('https://www.city.hamamatsu.shizuoka.jp/koho2/emergency/korona.html')
+    response.encoding = response.apparent_encoding
 
-    def handle_data(self, data):
-        if self.listInDay and not self.link:
-            data = data.strip().rstrip("／")
-            if data and self.lasttag == 'li':
-               self.news.append({"date": self.currentDate,"url":"","text": data})
-               return
-            if data:
-               text = self.news[-1].get("text")
-               self.news[-1].update({"text": text + data.strip()})
-               return
-        if self.link:
-            self.news[-1].update({"text": data.strip() + self.supplement})
-            return
-        if self.inDay and not self.ulInDay:
-            data = data.strip()
+    today_news = []
+    soup = BeautifulSoup(response.text, 'html.parser')
+    # <div class="box_info_cnt">
+    box_info_cnt = soup.find('div', {'class': 'box_info_cnt'})
+    # <div class="box_info_cnt"><ul>
+    for index,tag in enumerate(box_info_cnt.find_all('ul')):
+        # even-indexed <ul> elements hold the date heading
+        if (index % 2 == 0):
+            data = tag.find('li').text.strip()
             tokyo_tz = timezone(timedelta(hours=+9))
             currentTime = datetime.now(tokyo_tz)
             if data:
@@ -120,20 +59,22 @@ def handle_data(self, data):
                     year = currentTime.year
                     if int(month) == 12 and currentTime.month == 1:
                         year = year - 1
-                    self.currentDate = "{}/{}/{}".format(year,month.zfill(2),day.zfill(2))
+                    currentDate = "{}/{}/{}".format(year,month.zfill(2),day.zfill(2))
                 else:
                     m = re.match(r'([0-9]{4})年([0-9]{1,2})月([0-9]{1,2})日', data)
                     year, month, day = m.groups()
-                    self.currentDate = "{}/{}/{}".format(year, month.zfill(2),day.zfill(2))
-            return
-
-def main():
-    response = requests.get('https://www.city.hamamatsu.shizuoka.jp/koho2/emergency/korona.html')
-    response.encoding = response.apparent_encoding
-    parser = NewsParser()
-    parser.feed(response.text)
-    parser.close()
+                    currentDate = "{}/{}/{}".format(year, month.zfill(2),day.zfill(2))
+        # odd-indexed <ul> elements hold that date's news items
+        else:
+            for news in tag.find_all('li'):
+                href = news.find('a')
+                if href:
+                    data = news.text.strip().rstrip("／")
+                    today_news.append({"date": currentDate,"url":BASE_URL + href.get('href'),"text": data})
+                else:
+                    data = news.text.strip().rstrip("／")
+                    today_news.append({"date": currentDate,"url":"","text": data})
 
-    print(json.dumps({"newsItems": parser.news}, indent=2, ensure_ascii=False))
+    print(json.dumps({"newsItems": today_news}, indent=2, ensure_ascii=False))
 if __name__ == '__main__':
     main()
diff --git a/tool/requirements.txt b/tool/requirements.txt
@@ -0,0 +1,2 @@
+beautifulsoup4==4.10.0
+requests==2.26.0