Studio KimHippo :D
#1. Kim_crawl_MK.1 (김_크롤_1호)
Required packages
from bs4 import BeautifulSoup as bs
from pprint import pprint
import requests as req
import os, re
The class
class crawl:
    def __init__(self, in_url):
        self.url = in_url
        # self.driver = webdriver.Chrome('chromedriver')
        # self.driver.get(in_url)

    def get_parser(self):
        # Fetch the page and hand back a parsed BeautifulSoup object.
        html = req.get(self.url)
        soup = bs(html.text, 'html.parser')
        return soup

    def get_obj(self, in_tag, in_val=None, in_key=None, func=None):
        # Thin wrapper around find / find_all:
        #   func='all' or 'All' -> find_all, func=None -> find
        #   in_key=None         -> in_val is treated as a CSS class filter
        #   otherwise           -> search by the {in_key: in_val} attribute
        dic = {}
        soup = self.get_parser()
        obj = None  # stays None if func is neither 'all'/'All' nor None
        if func in ('All', 'all'):
            if in_key is None:
                obj = soup.find_all(in_tag, in_val)
            else:
                dic.setdefault(in_key, in_val)
                obj = soup.find_all(in_tag, dic)
        elif func is None:
            if in_key is None:
                obj = soup.find(in_tag, in_val)
            else:
                dic.setdefault(in_key, in_val)
                obj = soup.find(in_tag, dic)
        return obj
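As a quick sanity check, get_obj just forwards to BeautifulSoup's find / find_all. The snippet below (my own toy HTML, not from the post) shows what each argument combination resolves to:

# Toy markup for illustration only; the class name mirrors the one crawled below.
sample = bs('<div><h4 class="h_trend">A</h4><h4 class="h_trend">B</h4></div>',
            'html.parser')

# get_obj('h4', 'h_trend', None, 'all')  ->  soup.find_all('h4', 'h_trend')
print([t.text for t in sample.find_all('h4', 'h_trend')])   # ['A', 'B']

# get_obj('h4', 'h_trend', 'class')      ->  soup.find('h4', {'class': 'h_trend'})
print(sample.find('h4', {'class': 'h_trend'}).text)         # A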
-- Experiment --
if __name__ == '__main__':
    try:
        # Collect the article headlines into text_data/real_2.txt.
        if not os.path.isdir('text_data'):
            os.mkdir('text_data')
        os.chdir('text_data')
        for o_rep in range(1, 21):
            page = '?page=' + str(o_rep)
            print('====================== page : ' + str(o_rep) + ' ======================')
            craw = crawl('https://land.naver.com/isale/isaleNewsTrend.nhn' + page)
            data = craw.get_obj('h4', 'h_trend', None, 'all')
            posts = ''
            for i_rep in data:
                posts += i_rep.text
            posts = posts.replace('\t', '')
            # utf-8 so the Korean headlines are written safely on any platform
            with open('real_2.txt', 'a', encoding='utf-8') as real:
                real.write(posts)
    except Exception as err:
        # Report the failure instead of swallowing it silently.
        print(err)
    print('File saved!')
    print('Crawling Complete!')
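One refinement worth considering (my suggestion, not part of the original post): check the HTTP status and pause between requests, so a failed or rate-limited page fails loudly instead of silently producing an empty file. fetch_page below is a hypothetical helper; it reuses the req and bs aliases imported above, and the timeout and delay values are assumptions.

import time

def fetch_page(page_no):
    # Same URL pattern as the loop above.
    url = 'https://land.naver.com/isale/isaleNewsTrend.nhn?page=' + str(page_no)
    resp = req.get(url, timeout=10)
    resp.raise_for_status()   # raise on 4xx/5xx instead of writing nothing
    time.sleep(1)             # be polite: roughly one request per second
    return bs(resp.text, 'html.parser')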
-- Results --
The crawled headlines for pages 1 through 20 are appended to text_data/real_2.txt.