크롤링 중 list index out of range 에러 도움 부탁드립니다

22.03.30 13:43 작성 조회수 376

0

제가 작성한 코드는 아닙니다

크롤링 하는 와중에 list index of range 에러가 나오는데 해결법을 못 찾아서 질문드립니다..

 

 

 

 

from urllib.request import urlopen

from bs4 import BeautifulSoup

from xml.dom.pulldom import END_DOCUMENT

import pandas as pd

import requests

from bs4 import  BeautifulSoup

from datetime import datetime

import re

from tqdm import tqdm

from tqdm.contrib.concurrent import process_map

import math

from time import sleep

from multiprocessing.dummy import Pool

import multiprocessing as mp

from multiprocessing.pool import MaybeEncodingError

 

 

 

 

start_date = "y1=2019&m1=09&d1=25"

end_date = "y2=2019&m2=09&d2=30"

url = "https://find.mk.co.kr/new/search.php?pageNum={}&cat=&cat1=&media_eco=&pageSize=10&sub=all&dispFlag=OFF&page=news&s_kwd=%BB%EF%BC%BA%C0%FC%C0%DA&s_page=news&go_page=&ord=1&ord1=1&ord2=0&s_keyword=%BB%EF%BC%BA%C0%FC%C0%DA&period=p_direct&s_i_keyword=%BB%EF%BC%BA%C0%FC%C0%DA&s_author=&{}&{}&ord=1&area=ttbd"

 

def get_list(idx) :

 

    #idx = 검색했을때 page 번호

    req = requests.get(url.format(idx, start_date, end_date))

 

    #한글깨져서 인코딩 

    soup =  BeautifulSoup(req.content.decode('euc-kr','replace'), 'html.parser')

    div_list = soup.find_all('div', {'class' : 'sub_list'})

    art_list = [i.find('span', {'class': 'art_tit'}) for i in div_list]

 

    #db에 저장할거 title, href, body, date

    df = pd.DataFrame(columns = {'title','href', 'date','body'})

    for article in art_list:

        append_flag = True

 

 

        title = str(article.find("a").contents[0])

        href = str(article.find("a")["href"])

        body_text = None

        date = None

        try:

            req = requests.get(href, timeout=2)

        except requests.exceptions.Timeout as errd:

            print("Timeout Error : ", errd)

        except requests.exceptions.ConnectionError as errc:

            print("Error Connecting : ", errc)

 

        except requests.exceptions.HTTPError as errb:

            print("Http Error : ", errb)

        # Any Error except upper exception

        except requests.exceptions.RequestException as erra:

            print("AnyException : ", erra)

 

        try:

            soup =  BeautifulSoup(req.content.decode('euc-kr','replace'), 'html.parser')

        except:

            print("parser error")

 

 

        date_text = soup.find('li', {'class' : 'lasttime'})

        if not date_text :

            date_text = soup.find('li', {'class' : 'lasttime1'})

        if date_text :

            match = re.search(r'\d{4}.\d{2}.\d{2}', date_text.string)

            if match :

                date = datetime.strptime(match.group(), '%Y.%m.%d').date()

            else :

                print("match none")

        else :

            append_flag = False

            #print("mssing date text")

 

 

        art_text = soup.find('div', {'class' : 'art_txt'})  

        if not art_text :

            art_text = soup.find('div', {'class' : 'article_body'}) 

        if not art_text :

            art_text = soup.find('div', {'class' : 'view_txt'}) 

        if art_text :

            body_text = art_text.get_text()

        else :

            append_flag = False

            #print("mssing body text")

            #print("link : " + href)

 

        if append_flag : 

            temp = pd.DataFrame({'title' : [ title ], 'href' : [ href ], 'date' : [ date ], 'body' : [body_text]})  

            df = df.append(temp)

 

    return df

 

def get_count() :

    req = requests.get(url.format(1, start_date, end_date))

    #한글깨져서 인코딩

    soup =  BeautifulSoup(req.content.decode('euc-kr','replace'), 'html.parser')

    count_text = soup.find('span', {'class' : 'class_tit'}).get_text().replace(",","")

    art_count = re.search("\d+",count_text)

    "y1=2019&m1=03&d1=15"

 

    print(start_date[3:7]+"년 "+start_date[11:13]+"월 "+start_date[17:]+"일 부터 "

    +end_date[3:7]+"년 "+end_date[11:13]+"월 "+end_date[17:]+"일 까지 총 "

    +art_count.group(0)+"개의 기사")

 

    return art_count.group(0)

 

if __name__ == "__main__":

 

    count = get_count()

    tasks_count = math.ceil(float(count)/20) + 1

 

    #tasks = range(1,10)

    tasks = range(1,tasks_count)

    result_list = process_map(get_list, tasks,max_workers=4)

    df = pd.concat(result_list)

    #df = pd.concat(parmap.map(get_list, tasks, pm_pbar = True, pm_processes = 4))

 

    print(df)     

    file_name = start_date[5:7]+start_date[11:13]+start_date[17:]+"_"+end_date[5:7]+end_date[11:13]+end_date[17:]

    df.to_csv(file_name+'.csv', index = False, encoding='utf-8-sig')

 

------------------------------------------------------------------------------------------------------------------------------------------------------

코드는 이렇구요

 

_RemoteTraceback                          Traceback (most recent call last)
_RemoteTraceback: 
"""
Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/envs/python3/lib/python3.6/concurrent/futures/process.py", line 175, in _process_worker
    r = call_item.fn(*call_item.args, **call_item.kwargs)
  File "/home/ubuntu/anaconda3/envs/python3/lib/python3.6/concurrent/futures/process.py", line 153, in _process_chunk
    return [fn(*args) for args in chunk]
  File "/home/ubuntu/anaconda3/envs/python3/lib/python3.6/concurrent/futures/process.py", line 153, in <listcomp>
    return [fn(*args) for args in chunk]
  File "<ipython-input-7-167ab35f9166>", line 22, in get_list
    title = str(article.find("a").contents[0])
IndexError: list index out of range
"""

The above exception was the direct cause of the following exception:

IndexError                                Traceback (most recent call last)
<ipython-input-7-167ab35f9166> in <module>()
     96   #tasks = range(1,10)
     97   tasks = range(1,tasks_count)
---> 98   result_list = process_map(get_list, tasks,max_workers=4)
     99   df = pd.concat(result_list)
    100   #df = pd.concat(parmap.map(get_list, tasks, pm_pbar = True, pm_processes = 4))

/home/ubuntu/anaconda3/envs/python3/lib/python3.6/site-packages/tqdm/contrib/concurrent.py in process_map(fn, *iterables, **tqdm_kwargs)
    128         tqdm_kwargs = tqdm_kwargs.copy()
    129         tqdm_kwargs["lock_name"] = "mp_lock"
--> 130     return _executor_map(ProcessPoolExecutor, fn, *iterables, **tqdm_kwargs)

/home/ubuntu/anaconda3/envs/python3/lib/python3.6/site-packages/tqdm/contrib/concurrent.py in _executor_map(PoolExecutor, fn, *iterables, **tqdm_kwargs)
     74             map_args.update(chunksize=chunksize)
     75         with PoolExecutor(**pool_kwargs) as ex:
---> 76             return list(tqdm_class(ex.map(fn, *iterables, **map_args), **kwargs))
     77 
     78 

/home/ubuntu/anaconda3/envs/python3/lib/python3.6/site-packages/tqdm/notebook.py in __iter__(self)
    255     def __iter__(self):
    256         try:
--> 257             for obj in super(tqdm_notebook, self).__iter__():
    258                 # return super(tqdm...) will not catch exception
    259                 yield obj

/home/ubuntu/anaconda3/envs/python3/lib/python3.6/site-packages/tqdm/std.py in __iter__(self)
   1183 
   1184         try:
-> 1185             for obj in iterable:
   1186                 yield obj
   1187                 # Update and possibly print the progressbar.

/home/ubuntu/anaconda3/envs/python3/lib/python3.6/concurrent/futures/process.py in _chain_from_iterable_of_lists(iterable)
    364     careful not to keep references to yielded objects.
    365     """
--> 366     for element in iterable:
    367         element.reverse()
    368         while element:

/home/ubuntu/anaconda3/envs/python3/lib/python3.6/concurrent/futures/_base.py in result_iterator()
    584                     # Careful not to keep a reference to the popped future
    585                     if timeout is None:
--> 586                         yield fs.pop().result()
    587                     else:
    588                         yield fs.pop().result(end_time - time.monotonic())

/home/ubuntu/anaconda3/envs/python3/lib/python3.6/concurrent/futures/_base.py in result(self, timeout)
    430                 raise CancelledError()
    431             elif self._state == FINISHED:
--> 432                 return self.__get_result()
    433             else:
    434                 raise TimeoutError()

/home/ubuntu/anaconda3/envs/python3/lib/python3.6/concurrent/futures/_base.py in __get_result(self)
    382     def __get_result(self):
    383         if self._exception:
--> 384             raise self._exception
    385         else:
    386             return self._result


---------------------------------------------------------------------------------------------------------------------

이렇게 에러가 뜹니다





title = str(article.find("a").contents[0]) 이 부분에서
contents가 존재하지 않는데 인덱스로 접근하려고 해서 오류가 난 것 같은데
contents가 무조건 존재 하는게 아니라면 존재하지 않는 경우의 예외처리를 추가하려면 어떻게 해야될까요?
어느 위치에 뭐라고 작성해야 할지 몰라서 막막해서 질문드립니다

답변 0

답변을 작성해보세요.

답변을 기다리고 있는 질문이에요.
첫번째 답변을 남겨보세요!