Inflearn Community Q&A

When an expected value is missing
Hello, I'm really enjoying the lectures.

I'm currently extracting subject, link, content, press, and date. If one of those values is missing, how do you go about validating that?

The approach I came up with was to validate with pydantic, but I'm wondering whether that's over-engineering. I'm curious how you handle this flexibly.

To illustrate, I'm attaching the code I wrote with pydantic.
from typing import Annotated

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlencode
from datetime import datetime
from pydantic import AfterValidator, BaseModel

def non_empty_str(v: str) -> str:
    s = (v or "").strip()
    if not s:
        raise ValueError("Empty string.")
    return s

def valid_url(url: str) -> str:
    try:
        response = requests.get(url, timeout=5)
        # raise an error on 4xx/5xx status codes
        response.raise_for_status()
        return url
    except requests.RequestException:
        raise ValueError("Invalid URL.")

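# Note (an aside, not from the original code): requests.get() performs a live
# network round-trip for every URL being validated, which is slow and can fail
# even for well-formed links. A purely syntactic check with pydantic v2 would be:
#
#     from pydantic import HttpUrl, TypeAdapter
#     TypeAdapter(HttpUrl).validate_python(url)  # raises ValidationError if malformed
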
def valid_date(date: str) -> str:
    v = (date or "").strip()
    if not v:
        raise ValueError("Empty string.")
    try:
        datetime.strptime(v, "%Y-%m-%d %H:%M:%S")
        return date
    except ValueError:
        raise ValueError("Invalid date format.")

class NewsItem(BaseModel):
    subject: Annotated[str, AfterValidator(non_empty_str)]
    detail_article_url: Annotated[str, AfterValidator(valid_url)]
    content: Annotated[str, AfterValidator(non_empty_str)]
    press: Annotated[str, AfterValidator(non_empty_str)]
    article_date: Annotated[str, AfterValidator(valid_date)]

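# Note (a sketch, one possible alternative): if a field may legitimately be
# absent, it can be declared optional with a default instead of failing the
# whole item, e.g. (Python 3.10+ union syntax):
#
#     press: Annotated[str, AfterValidator(non_empty_str)] | None = None
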
ROOT = "https://finance.naver.com/"
PATH = "news/mainnews.naver"


def get_news_page_url(page: int, date: str):
    base_url = urljoin(ROOT, PATH)
    query_string = urlencode({"date": date, "page": page})
    return f"{base_url}?{query_string}"

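# Example (illustrative values):
#   get_news_page_url(2, "2024-01-15")
#   -> "https://finance.naver.com/news/mainnews.naver?date=2024-01-15&page=2"
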
def get_detail_news_url(path):
    return urljoin(ROOT, path)


def get_current_date(format: str = "%Y-%m-%d") -> str:
    now = datetime.now()
    return now.strftime(format)

def crawl_news_per_page(soup: BeautifulSoup):
    result = []
    news_per_page = soup.select(".block1")
    for article_card in news_per_page:
        # subject
        subject = article_card.select_one(".articleSubject > a").text
        # detail article link
        detail_article_url = get_detail_news_url(article_card.select_one(".articleSubject > a").get("href"))
        # content: the first child of .articleSummary is the summary text node
        content_tag = article_card.select_one(".articleSummary")
        content = content_tag.contents[0].strip()
        press = content_tag.select_one(".press").text.strip()
        article_date = content_tag.select_one(".wdate").text.strip()
        new_item = NewsItem(
            subject=subject,
            detail_article_url=detail_article_url,
            content=content,
            press=press,
            article_date=article_date,
        )
        result.append(new_item.model_dump())
    return result

def crawl_all_news(page_total_count: int, date: str):
    result = []
    for page in range(1, page_total_count + 1):
        request_url = get_news_page_url(page, date)
        response = requests.get(request_url, timeout=5)
        soup = BeautifulSoup(response.text, "html.parser")
        result.extend(crawl_news_per_page(soup))
        # stop when there is no next-block pager (.pgRR), i.e. this was the last page
        if not soup.select_one(".pgRR"):
            break
    return result

all_news = crawl_all_news(20, get_current_date())
print(all_news)
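For comparison, below is a minimal sketch of a lighter-weight alternative (helper names safe_text and build_news_item are hypothetical, and it reuses the NewsItem model above): treat a missing node as a skippable condition at extraction time, and catch pydantic's ValidationError per article so one malformed item does not abort the whole crawl.

from typing import Optional

from bs4 import Tag
from pydantic import ValidationError


def safe_text(parent: Tag, selector: str) -> Optional[str]:
    # Return stripped text when the selector matches, None otherwise,
    # instead of raising AttributeError on a missing node.
    node = parent.select_one(selector)
    return node.get_text(strip=True) if node else None


def build_news_item(raw: dict) -> Optional[dict]:
    # Validate a single article; on failure, report and skip it rather than
    # letting one bad record crash the whole crawl.
    try:
        return NewsItem(**raw).model_dump()
    except ValidationError as exc:
        print(f"skipping malformed article: {exc}")
        return None

Inside crawl_news_per_page, the loop would then append only non-None results, e.g. if (item := build_news_item(raw)) is not None: result.append(item).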
Answers

This question is still waiting for its first answer.