Studio KimHippo :D

#1. Kim_crawl_MK.1 (김_크롤_1호) 본문

Data Science/Crawler

#1. Kim_crawl_MK.1 (김_크롤_1호)

김작은하마 2019. 6. 27. 22:31

필요 패키지

from bs4 import BeautifulSoup as bs
from pprint import pprint
import requests as req
import os, re

클래스 부분

class crawl:
    """Tiny one-URL crawler: fetch a page with `requests` and query tags with BeautifulSoup."""

    def __init__(self, in_url):
        # Store the target URL; the page is fetched lazily on each get_parser() call.
        self.url = in_url

    def get_parser(self):
        """Fetch `self.url` and return the parsed BeautifulSoup tree (html.parser backend)."""
        html = req.get(self.url)
        soup = bs(html.text, 'html.parser')
        return soup

    def get_obj(self, in_tag, in_val=None, in_key=None, func=None):
        """Search the fetched page for `in_tag`.

        Parameters:
            in_tag: tag name to search for (e.g. 'h4').
            in_val: attribute value (or the raw second argument to find/find_all
                    when `in_key` is None — BeautifulSoup treats a bare string
                    there as a CSS class / attrs shorthand).
            in_key: attribute name; when given, the search uses {in_key: in_val}.
            func:   'all'/'All' for find_all (list of matches), None for find
                    (first match or None).

        Returns:
            A list of tags (func='all') or a single tag / None (func=None).

        Raises:
            ValueError: if `func` is any other value. (The original code fell
            through to an UnboundLocalError in that case.)
        """
        soup = self.get_parser()

        # Build the attrs argument once instead of branching twice below.
        attrs = {in_key: in_val} if in_key is not None else in_val

        if func is not None and func.lower() == 'all':
            # Unified on find_all; the original mixed find_all and the legacy
            # findAll alias, which are the same method.
            return soup.find_all(in_tag, attrs)
        if func is None:
            return soup.find(in_tag, attrs)
        raise ValueError("func must be 'all', 'All', or None, got: %r" % (func,))

-- 실험 --

if __name__ == '__main__':

    try:
        # Keep all output files in a dedicated directory.
        if not os.path.isdir('text_data'):
            os.mkdir('text_data')
        os.chdir('text_data')

        # Crawl pages 1..20 of the Naver real-estate news/trend listing.
        for o_rep in range(1, 21):

            page = '?page=' + str(o_rep)
            print('====================== page : ' + str(o_rep) + '======================')

            craw = crawl('https://land.naver.com/isale/isaleNewsTrend.nhn' + page)
            data = craw.get_obj('h4', 'h_trend', None, 'all')

            # Concatenate the text of every matched <h4 class="h_trend"> heading;
            # join() avoids the quadratic += accumulation of the original.
            posts = ''.join(i_rep.text for i_rep in data)
            posts = posts.replace('\t', '')

            # Append mode so each page's text accumulates into one file.
            # Explicit encoding: the scraped text is Korean, so the platform
            # default encoding is not safe.
            with open('real_2.txt', 'a', encoding='utf-8') as real:
                real.write(posts)

    except Exception as err:
        # The original bare `except: pass` silently hid network, parse, and
        # filesystem errors and then still reported success. Report instead.
        print('Crawling failed:', err)

    print('File saved!')
    print('Crawling Complete!')

 

-- 결과 --

네이버 부동산 관련 페이지 1 ~ 20페이지까지 크롤링

 

 

 

크롤링한 텍스트 데이터

'Data Science > Crawler' 카테고리의 다른 글

#4. Kim_crawl_MK.1.1 (김_크롤_1.1호)  (0) 2019.07.09
#3. Kim_crawl_MK.2.1 (김_크롤_2.1호)  (0) 2019.06.28
#2. Kim_crawl_MK.2 (김_크롤_2호)  (0) 2019.06.28
Comments