From 1c8294925207f192e8b12569f54e1456fa9b5d02 Mon Sep 17 00:00:00 2001 From: euledge Date: Sun, 7 Aug 2022 22:01:19 +0900 Subject: [PATCH] =?UTF-8?q?beatifulsoup=E3=82=92=E4=BD=BF=E7=94=A8?= =?UTF-8?q?=E3=81=97=E3=81=A6=E6=96=B0=E3=81=97=E3=81=84=E6=A7=8B=E9=80=A0?= =?UTF-8?q?=E3=81=AE=E3=82=B9=E3=82=AF=E3=83=AC=E3=82=A4=E3=83=94=E3=83=B3?= =?UTF-8?q?=E3=82=B0=E3=82=92=E8=A1=8C=E3=81=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .github/workflows/create_data_news_json.yml | 1 + tool/create_news_json.py | 113 +++++--------------- tool/requirements.txt | 2 + 3 files changed, 30 insertions(+), 86 deletions(-) create mode 100644 tool/requirements.txt diff --git a/.github/workflows/create_data_news_json.yml b/.github/workflows/create_data_news_json.yml index d17538a4427d2..df67a99fa245d 100644 --- a/.github/workflows/create_data_news_json.yml +++ b/.github/workflows/create_data_news_json.yml @@ -19,6 +19,7 @@ jobs: continue-on-error: true id: error_on_news_json run: | + pip install -r production/tool/requirements.txt python3 -B production/tool/create_news_json.py > news.json mv news.json production/data/hamamatsu/news.json # - name: RUN PYTHON SCRIPT(create data.json) diff --git a/tool/create_news_json.py b/tool/create_news_json.py index 1540f6b73435c..26396533a915f 100644 --- a/tool/create_news_json.py +++ b/tool/create_news_json.py @@ -7,6 +7,7 @@ from datetime import timezone from datetime import timedelta from html.parser import HTMLParser +from bs4 import BeautifulSoup ''' 浜松市コロナサイトのお知らせ部分からnews.jsonを作成する @@ -34,83 +35,21 @@ } ''' -class NewsParser(HTMLParser): - def __init__(self): - HTMLParser.__init__(self) - self.BASE_URL = 'https://www.city.hamamatsu.shizuoka.jp' - self.inContents = False - self.inDay = False - self.ulInDay = False - self.listInDay = False - self.link = False - self.news = [] - self.currentDate = '' - self.supplement = '' - self.starttag = '' - self.endtag = '' - - def handle_starttag(self, tag, attrs): - attrs = dict(attrs) - self.starttag = tag - #
- if tag == "div" and "class" in attrs and attrs['class'] == "box_info_cnt": - self.inContents = True - return - #
  • x月y日 - if tag == "li" and self.inContents and not self.inDay: - self.inDay = True - return - #
  • x月y日
      - if tag == "ul" and self.inDay: - self.ulInDay = True - return - #
    • x月y日
      • - if tag == "li" and self.ulInDay: - self.listInDay = True - return - #
      • x月y日
        • yyyyyyyy - if tag == "a" and self.listInDay: - self.link = True - if attrs["href"].startswith("http"): - self.news.append({"date": self.currentDate,"url": attrs["href"]}) - else: - self.news.append({"date": self.currentDate,"url": self.BASE_URL + attrs["href"]}) - return - +def main(): + BASE_URL = 'https://www.city.hamamatsu.shizuoka.jp' - def handle_endtag(self, tag): - self.endtag = tag - if tag == "a" and self.link: - self.link = False - return - if tag == "li" and self.listInDay: - self.listInDay = False - return - if tag == "ul" and self.ulInDay: - self.ulInDay = False - return - if tag == "li" and self.inDay: - self.inDay = False - return - if tag == "div" and self.inContents: - self.inContents = False - return + response = requests.get('https://www.city.hamamatsu.shizuoka.jp/koho2/emergency/korona.html') + response.encoding = response.apparent_encoding - def handle_data(self, data): - if self.listInDay and not self.link: - data = data.strip().rstrip("/") - if data and self.lasttag == 'li': - self.news.append({"date": self.currentDate,"url":"","text": data}) - return - if data: - text = self.news[-1].get("text") - self.news[-1].update({"text": text + data.strip()}) - return - if self.link: - self.news[-1].update({"text": data.strip() + self.supplement}) - return - if self.inDay and not self.ulInDay: - data = data.strip() + today_news = [] + soup = BeautifulSoup(response.text, 'html.parser') + #
          + box_info_cnt = soup.find('div', {'class': 'box_info_cnt'}) + #
            + for index,tag in enumerate(box_info_cnt.find_all('ul')): + # if index is even then date + if (index % 2 == 0): + data = tag.find('li').text.strip() tokyo_tz = timezone(timedelta(hours=+9)) currentTime = datetime.now(tokyo_tz) if data: @@ -120,20 +59,22 @@ def handle_data(self, data): year = currentTime.year if int(month) == 12 and currentTime.month == 1: year = year - 1 - self.currentDate = "{}/{}/{}".format(year,month.zfill(2),day.zfill(2)) + currentDate = "{}/{}/{}".format(year,month.zfill(2),day.zfill(2)) else: m = re.match(r'([0-9]{4})年([0-9]{1,2})月([0-9]{1,2})日', data) year, month, day = m.groups() - self.currentDate = "{}/{}/{}".format(year, month.zfill(2),day.zfill(2)) - return - -def main(): - response = requests.get('https://www.city.hamamatsu.shizuoka.jp/koho2/emergency/korona.html') - response.encoding = response.apparent_encoding - parser = NewsParser() - parser.feed(response.text) - parser.close() + currentDate = "{}/{}/{}".format(year, month.zfill(2),day.zfill(2)) + # if index is odd then news + else: + for news in tag.find_all('li'): + href = news.find('a') + if href: + data = news.text.strip().rstrip("/") + today_news.append({"date": currentDate,"url":BASE_URL + href.get('href'),"text": data}) + else: + data = news.text.strip().rstrip("/") + today_news.append({"date": currentDate,"url":"","text": data}) - print(json.dumps({"newsItems": parser.news}, indent=2, ensure_ascii=False)) + print(json.dumps({"newsItems": today_news}, indent=2, ensure_ascii=False)) if __name__ == '__main__': main() diff --git a/tool/requirements.txt b/tool/requirements.txt new file mode 100644 index 0000000000000..6f042747b90db --- /dev/null +++ b/tool/requirements.txt @@ -0,0 +1,2 @@ +beautifulsoup4==4.10.0 +requests==2.26.0