Studio KimHippo :D

#3. Kim_crawl_MK.2.1 (김_크롤_2.1호) 본문

Data Science/Crawler

#3. Kim_crawl_MK.2.1 (김_크롤_2.1호)

김작은하마 2019. 6. 28. 02:14

sub_kim_crawl에 필요한 패키지

# -*- coding: utf-8 -*-

from Kim_crawl_mk1 import crawl as kcc
from time import sleep
import os

sub_kim_crawl

class sub_crawl:
    """Interactive text crawler built on ``Kim_crawl_mk1.crawl`` (``kcc``).

    Every crawl asks the user for its parameters on stdin, extracts text
    through ``kcc(url).get_obj(...)`` and appends it to
    ``text_data/<file name>.txt`` below the current working directory.
    The working directory itself is left untouched, so the methods can be
    called repeatedly without nesting ``text_data`` directories.
    """

    def __init__(self):
        pass

    def none(self, in_obj):
        """Map the typed placeholders ``'None'``/``'none'`` to ``None``.

        Any other value is returned unchanged.
        """
        if in_obj == 'None' or in_obj == 'none':
            return None
        return in_obj

    def get_parameters(self):
        """Prompt for the crawl settings on stdin.

        Returns:
            The tuple ``(file_name, url, tag, value, class, function)``
            holding the raw strings exactly as typed by the user.
        """
        print('\n ================================================================= \n')
        file_name = input('Input your file name : ')
        sleep(0.5)

        i_url = input('Input Your url : ')
        sleep(0.5)

        i_tag = input('Input your html tag : ')
        sleep(0.5)
        i_val = input('Input your html value : ')
        sleep(0.5)
        i_class = input('Input your class (None or class name): ')
        sleep(0.5)
        i_func = input('Input your function (None or All or all): ')

        return file_name, i_url, i_tag, i_val, i_class, i_func

    def _extract_text(self, craw, tag, val, cla, func):
        """Return the text that ``craw.get_obj`` yields for one page.

        When ``func`` is given, the result of ``get_obj`` is iterated and the
        ``.text`` of every item is joined with a trailing newline each;
        otherwise ``.text`` is read from the single returned object.
        NOTE(review): the exact return types of ``get_obj`` live in
        Kim_crawl_mk1 and are not visible here.
        """
        found = craw.get_obj(tag, val, cla, func)
        if func is not None:
            return ''.join(item.text + '\n' for item in found)
        return found.text

    def _append_text(self, file_name, text):
        """Append ``text`` to ``text_data/<file_name>.txt`` (UTF-8).

        The directory is created on demand. The path is built explicitly
        instead of ``chdir``-ing into ``text_data`` so the process working
        directory is not changed as a side effect.
        """
        os.makedirs('text_data', exist_ok=True)
        target = os.path.join('text_data', file_name + '.txt')
        # Explicit encoding: crawled text may contain characters that the
        # platform's default locale encoding cannot represent.
        with open(target, 'a', encoding='utf-8') as out:
            out.write(text)

    def one_page_text(self):
        """Crawl a single page and append its text to the output file.

        Returns:
            The text that was extracted and written.
        """
        file_name, i_url, tag, val, cla, func = self.get_parameters()
        craw = kcc(i_url)

        cla = self.none(cla)
        func = self.none(func)

        string = self._extract_text(craw, tag, val, cla, func)
        self._append_text(file_name, string)

        return string

    def some_page_text(self):
        """Crawl an inclusive range of pages and append each page's text.

        The page number is attached to the URL as ``?&page=<n>``.

        Returns:
            The text of the last crawled page, or ``''`` when the requested
            range is empty (first page greater than last page).

        Raises:
            ValueError: if a page number typed by the user is not an integer.
        """
        file_name, i_url, tag, val, cla, func = self.get_parameters()
        s_page = int(input('Input your first page : '))
        e_page = int(input('Input your last page : '))

        # Loop-invariant: translate the placeholders once, not per page.
        cla = self.none(cla)
        func = self.none(func)

        # Defined up front so an empty page range returns '' instead of
        # failing with UnboundLocalError at the final return.
        string = ''

        for o_rep in range(s_page, e_page + 1):

            page = '?&page=' + str(o_rep)
            craw = kcc(i_url + page)

            string = self._extract_text(craw, tag, val, cla, func)
            self._append_text(file_name, string)

            # Pause after every 20th page number before continuing.
            if o_rep % 20 == 0:
                sleep(3)

            print('============================================= [ Now crawling page : ' + str(o_rep) + ' ] =============================================', end='\n')

        return string

    def complete(self):
        """Print the "crawling complete / file saved" banner."""
        print('\n\n\n')
        print(
            '=====================================[ Crawling Complete! ]=====================================')
        print(
            '=====================================[     File Saved!    ]=====================================')

sub_kim_crawl 실험

if __name__ == '__main__':
    # Manual smoke run: crawl a page range interactively, then print the
    # completion banner.
    crawler = sub_crawl()
    data = crawler.some_page_text()
    crawler.complete()

 

Kim_Crawl_MK.2.1에 필요한 패키지

    from urllib.request import urlretrieve as urlr
    from Kim_crawl_mk1 import crawl as kcc
    from sub_kim_crawl import sub_crawl as scc
    from bs4 import BeautifulSoup as bs
    from selenium import webdriver
    from pprint import pprint
    import os, re, sys, time
    import winsound as ws

Kim_Crawl_MK.2.1

class kim_crawl_class_mk2:
    """Console menu that drives the ``sub_crawl`` text crawler (``scc``).

    The menus run as loops rather than by calling ``main`` recursively, so
    the call stack does not grow with every menu round, and every round of
    the main menu starts from the directory the object was created in, so
    repeated rounds do not nest ``data`` directories inside each other.
    """

    def __init__(self):
        self.sub = scc()
        # Directory the object was created in; main() returns here before
        # showing the menu.
        self.pwd = os.getcwd()

    def main(self):
        """Show the main menu until one crawl action has been carried out.

        Returns after a text or image crawl was started from the sub menu.
        Selecting quit and confirming it ends the program via ``sys.exit``.
        """
        while True:
            # Start each round from the launch directory: mk_dir() below
            # works relative to the current directory.
            os.chdir(self.pwd)

            print('\n')
            print(
                '================================= [Hello, My name is Kim Crawl] =================================')
            print(
                '================================= [    How can I help you?    ] =================================')
            print('\n')
            num = input('[1]. Only one page Crawling \t\t\t [2]. Some pages Crawling \n'
                        '[3]. Confirm your directory \t\t\t [!]. Quit Program \t\t\t \n'
                        '[?]. Select your function : ')

            if num == '1' or num == '2':
                if self._crawl_menu(multi_page=(num == '2')):
                    return
            elif num == '3':
                self.confirm_dir()
            elif num == '!':
                self._confirm_quit()
            else:
                print('You must select another option. ')

    def _crawl_menu(self, multi_page):
        """Ask which kind of data to crawl and run the chosen action.

        Args:
            multi_page: ``True`` to crawl a page range
                (``some_page_text``), ``False`` for a single page
                (``one_page_text``).

        Returns:
            ``True`` if a crawl action ran, ``False`` if the user chose
            "Back" or typed an unknown option.
        """
        self.mk_dir()

        print('\n')
        print('==========================================================================================================================')
        print('\n')

        num_2 = input('[1]. Text data \t\t\t [2]. Image data \n'
                      '[!]. Back \n'
                      '[?]. What do you need kind of data? : ')

        if num_2 == '1':
            if multi_page:
                self.sub.some_page_text()
            else:
                self.sub.one_page_text()
            self.sub.complete()
            return True

        if num_2 == '2':
            self.img_crawl()
            return True

        if num_2 != '!':
            print('You must select another option')
        return False

    def _confirm_quit(self):
        """Ask for confirmation and exit the program on "Yes".

        Returns normally when the user answers "No" or types anything else.

        Raises:
            SystemExit: when the user confirms with ``'1'``.
        """
        print('\n')
        are = input('[1].Yes \t\t\t [2]. No \n'
                    '[?]. Are you sure? : ')
        if are == '1':
            print('Good bye! \n')
            sys.exit()
        if are != '2':
            print('You must select another option')

    def img_crawl(self):
        """Placeholder for image crawling.

        Only prints a notice, then creates ``img_data`` in the current
        directory if needed and changes into it.
        """
        print('This function not ready yet... :(')
        os.makedirs('img_data', exist_ok=True)
        os.chdir('img_data')

    def is_continue(self):
        """Keep offering further crawls until the user quits.

        This loop never returns normally; it ends only when the user
        confirms quitting (``SystemExit``). Every other answer, including
        an unknown option, leads back to the main menu.
        """
        while True:
            print('\n\n')
            i_continue = input('[1]. Continue \t\t\t [!].Quit \n'
                               '[?]. Are you continue crawling ? : ')

            if i_continue == '!':
                self._confirm_quit()
            elif i_continue != '1':
                print('You must select another option. ')

            self.main()

    def mk_dir(self):
        """Create ``data`` in the current directory if needed and enter it."""
        os.makedirs('data', exist_ok=True)
        os.chdir('data')

    def confirm_dir(self):
        """Print the current working directory."""
        current_dir = os.getcwd()
        print(current_dir)


if __name__ == '__main__':
    try:
        pwd = os.getcwd()
        craw = kim_crawl_class_mk2()
        craw.main()
        time.sleep(2)
        os.chdir(pwd)
        craw.is_continue()

    except NameError as e:
        print('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! [ Alert! Error Occurred! ]!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ')
        print('Error Name : ', e)

        # Audible alarm: five 1500 Hz beeps of two seconds each (winsound).
        for _ in range(5):
            ws.Beep(1500, 2000)
        time.sleep(5)

[1] 크롤링 파트만 따로 만든 sub_kim_crawl까지 만들었는데 여전히 메인 클래스 부분의 코드가 굉장히 난잡하다.

[2] 김 크롤 2호와 달라진 점이라고 하면 한정적이지만 여러 페이지 크롤링이 가능해졌고,

    오류가 발생했을 때 프로그램의 대처 방법이 달라졌다.

'Data Science > Crawler' 카테고리의 다른 글

#4. Kim_crawl_MK.1.1 (김_크롤_1.1호)  (0) 2019.07.09
#2. Kim_crawl_MK.2 (김_크롤_2호)  (0) 2019.06.28
#1. Kim_crawl_MK.1 (김_크롤_1호)  (0) 2019.06.27
Comments