Skip to content
This repository has been archived by the owner on May 9, 2023. It is now read-only.

WIP: BeautifulSoupを使用して新しい構造のスクレイピングを行う #1548

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/create_data_news_json.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ jobs:
continue-on-error: true
id: error_on_news_json
run: |
pip install -r production/tool/requirements.txt
python3 -B production/tool/create_news_json.py > news.json
mv news.json production/data/hamamatsu/news.json
# - name: RUN PYTHON SCRIPT(create data.json)
Expand Down
113 changes: 27 additions & 86 deletions tool/create_news_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from datetime import timezone
from datetime import timedelta
from html.parser import HTMLParser
from bs4 import BeautifulSoup

'''
浜松市コロナサイトのお知らせ部分からnews.jsonを作成する
Expand Down Expand Up @@ -34,83 +35,21 @@
}
'''

class NewsParser(HTMLParser):
def __init__(self):
    """Initialise the underlying HTMLParser and reset all scraping state."""
    HTMLParser.__init__(self)

    # Prefix prepended to hrefs that do not start with "http".
    self.BASE_URL = 'https://www.city.hamamatsu.shizuoka.jp'

    # Collected news entries, in document order.
    self.news = []

    # Nesting flags: inside the news <div>, inside a date <li>, inside that
    # date's <ul>, inside one of its <li> items, and inside an <a>.
    self.inContents = self.inDay = self.ulInDay = self.listInDay = self.link = False

    # Text state: date attached to new entries, suffix appended to link
    # text, and the most recently seen opening / closing tag names.
    self.currentDate = self.supplement = self.starttag = self.endtag = ''

def handle_starttag(self, tag, attrs):
    """Advance the scraping state on an opening tag.

    The flags are set one nesting level at a time, following the layout
    ``<div class="box_info_cnt"> ... <li>date<ul><li><a href="...">``.
    Every ``<a>`` seen inside a news ``<li>`` appends a new entry to
    ``self.news`` carrying the current date and the link URL.

    Args:
        tag: Lower-cased tag name supplied by HTMLParser.
        attrs: List of ``(name, value)`` pairs supplied by HTMLParser;
            ``value`` is None for attributes written without a value.
    """
    attrs = dict(attrs)
    self.starttag = tag
    # <div class="box_info_cnt">
    # Compare class tokens, not the whole attribute string, so a div that
    # carries additional classes is still recognised. `or ""` covers a
    # valueless class attribute, whose value is None.
    if tag == "div" and "box_info_cnt" in (attrs.get("class") or "").split():
        self.inContents = True
        return
    # <li>x月y日
    if tag == "li" and self.inContents and not self.inDay:
        self.inDay = True
        return
    # <li>x月y日<ul>
    if tag == "ul" and self.inDay:
        self.ulInDay = True
        return
    # <li>x月y日<ul><li>
    if tag == "li" and self.ulInDay:
        self.listInDay = True
        return
    # <li>x月y日<ul><li><a href="xxxx.html">yyyyyyyy</a>
    if tag == "a" and self.listInDay:
        self.link = True
        # An anchor without a usable href (e.g. <a name="...">) used to
        # raise KeyError; record it with an empty URL instead, the same
        # shape used for plain-text news items.
        href = attrs.get("href")
        if href is None:
            url = ""
        elif href.startswith("http"):
            url = href
        else:
            url = self.BASE_URL + href
        self.news.append({"date": self.currentDate, "url": url})
        return

def main():
BASE_URL = 'https://www.city.hamamatsu.shizuoka.jp'

def handle_endtag(self, tag):
    """Unwind the scraping state on a closing tag.

    Flags are cleared innermost-first; only the first flag that is both
    set and paired with ``tag`` is reset for any one closing tag.

    Args:
        tag: Lower-cased tag name supplied by HTMLParser.
    """
    self.endtag = tag
    # (closing tag, flag it clears), ordered from innermost to outermost.
    unwind_order = (
        ("a", "link"),
        ("li", "listInDay"),
        ("ul", "ulInDay"),
        ("li", "inDay"),
        ("div", "inContents"),
    )
    for closing_tag, flag_name in unwind_order:
        if tag == closing_tag and getattr(self, flag_name):
            setattr(self, flag_name, False)
            break
response = requests.get('https://www.city.hamamatsu.shizuoka.jp/koho2/emergency/korona.html')
response.encoding = response.apparent_encoding

def handle_data(self, data):
if self.listInDay and not self.link:
data = data.strip().rstrip("/")
if data and self.lasttag == 'li':
self.news.append({"date": self.currentDate,"url":"","text": data})
return
if data:
text = self.news[-1].get("text")
self.news[-1].update({"text": text + data.strip()})
return
if self.link:
self.news[-1].update({"text": data.strip() + self.supplement})
return
if self.inDay and not self.ulInDay:
data = data.strip()
today_news = []
soup = BeautifulSoup(response.text, 'html.parser')
# <div class="box_info_cnt">
box_info_cnt = soup.find('div', {'class': 'box_info_cnt'})
# <div class="box_info_cnt"><ul>
for index,tag in enumerate(box_info_cnt.find_all('ul')):
# if index is even then date
if (index % 2 == 0):
data = tag.find('li').text.strip()
tokyo_tz = timezone(timedelta(hours=+9))
currentTime = datetime.now(tokyo_tz)
if data:
Expand All @@ -120,20 +59,22 @@ def handle_data(self, data):
year = currentTime.year
if int(month) == 12 and currentTime.month == 1:
year = year - 1
self.currentDate = "{}/{}/{}".format(year,month.zfill(2),day.zfill(2))
currentDate = "{}/{}/{}".format(year,month.zfill(2),day.zfill(2))
else:
m = re.match(r'([0-9]{4})年([0-9]{1,2})月([0-9]{1,2})日', data)
year, month, day = m.groups()
self.currentDate = "{}/{}/{}".format(year, month.zfill(2),day.zfill(2))
return

def main():
response = requests.get('https://www.city.hamamatsu.shizuoka.jp/koho2/emergency/korona.html')
response.encoding = response.apparent_encoding
parser = NewsParser()
parser.feed(response.text)
parser.close()
currentDate = "{}/{}/{}".format(year, month.zfill(2),day.zfill(2))
# if index is odd then news
else:
for news in tag.find_all('li'):
href = news.find('a')
if href:
data = news.text.strip().rstrip("/")
today_news.append({"date": currentDate,"url":BASE_URL + href.get('href'),"text": data})
else:
data = news.text.strip().rstrip("/")
today_news.append({"date": currentDate,"url":"","text": data})

print(json.dumps({"newsItems": parser.news}, indent=2, ensure_ascii=False))
print(json.dumps({"newsItems": today_news}, indent=2, ensure_ascii=False))
if __name__ == '__main__':
main()
2 changes: 2 additions & 0 deletions tool/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
beautifulsoup4==4.10.0
requests==2.26.0