Studio KimHippo :D
#3. Kim_crawl_MK.2.1 (김_크롤_2.1호)
Packages required for sub_kim_crawl
# -*- coding: utf-8 -*-
from Kim_crawl_mk1 import crawl as kcc   # crawler class from the MK.1 post
from time import sleep
import os
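The sub module leans entirely on the crawl class from the MK.1 post (imported as kcc). For readers landing here first, below is a minimal sketch of the interface it is assumed to expose; tag, val, and cla are guessed to be the tag name, attribute name, and attribute value, and the real MK.1 implementation may well differ.

# Minimal sketch of the interface sub_kim_crawl expects from Kim_crawl_mk1.crawl.
# This is an assumption for illustration only, not the actual MK.1 code.
from urllib.request import urlopen
from bs4 import BeautifulSoup


class crawl:
    def __init__(self, url):
        # Fetch the page once and keep the parsed soup around.
        self.soup = BeautifulSoup(urlopen(url).read(), 'html.parser')

    def get_obj(self, tag, val, cla=None, func=None):
        # val is taken to be the attribute name ('class', 'id', ...), cla its value.
        attrs = {val: cla} if cla is not None else {}
        if func is not None:                 # 'All' / 'all': every matching element
            return self.soup.find_all(tag, attrs)
        return self.soup.find(tag, attrs)    # otherwise just the first match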
sub_kim_crawl
class sub_crawl:
    def __init__(self):
        pass

    def none(self, in_obj):
        # Convert the literal strings 'None' / 'none' typed at the prompt into None.
        if (in_obj == 'None') or (in_obj == 'none'):
            in_obj = None
        return in_obj

    def get_parameters(self):
        # Ask the user for everything a crawl needs, with short pauses between prompts.
        print('\n ================================================================= \n')
        file_name = input('Input your file name : ')
        sleep(0.5)
        i_url = input('Input your url : ')
        sleep(0.5)
        i_tag = input('Input your html tag : ')
        sleep(0.5)
        i_val = input('Input your html value : ')
        sleep(0.5)
        i_class = input('Input your class (None or class name) : ')
        sleep(0.5)
        i_func = input('Input your function (None or All or all) : ')
        return file_name, i_url, i_tag, i_val, i_class, i_func

    def one_page_text(self):
        # Crawl a single page and append its text to <file_name>.txt under text_data/.
        file_name, i_url, tag, val, cla, func = self.get_parameters()
        craw = kcc(i_url)
        if not os.path.isdir('text_data'):
            os.mkdir('text_data')
        os.chdir('text_data')
        string = ''
        cla = self.none(cla)
        func = self.none(func)
        if func is not None:
            # 'All' / 'all': get_obj returns every matching element.
            data = craw.get_obj(tag, val, cla, func)
            for rep in data:
                string += rep.text + '\n'
        else:
            # Only the first matching element.
            string = craw.get_obj(tag, val, cla, func).text
        with open(file_name + '.txt', 'a', encoding='utf-8') as data:
            data.write(string)   # utf-8 so Korean text saves consistently
        return string
    def some_page_text(self):
        # Crawl a range of pages (first..last) and append everything to one text file.
        file_name, i_url, tag, val, cla, func = self.get_parameters()
        s_page = int(input('Input your first page : '))
        e_page = int(input('Input your last page : '))
        if not os.path.isdir('text_data'):
            os.mkdir('text_data')
        os.chdir('text_data')
        cla = self.none(cla)
        func = self.none(func)
        for o_rep in range(s_page, e_page + 1):
            page = '?&page=' + str(o_rep)
            craw = kcc(i_url + page)
            string = ''
            if func is not None:
                data = craw.get_obj(tag, val, cla, func)
                for rep in data:
                    string += rep.text + '\n'
            else:
                string = craw.get_obj(tag, val, cla, func).text
            with open(file_name + '.txt', 'a', encoding='utf-8') as data:
                data.write(string)
            if o_rep % 20 == 0:
                # Pause every 20 pages so the target server is not hammered.
                sleep(3)
            print('=============================== [ Now crawling page : ' + str(o_rep) + ' ] ===============================')
        return string
    def complete(self):
        # Simple end-of-run banner.
        print('\n\n\n')
        print('===================================== [ Crawling Complete! ] =====================================')
        print('===================================== [ File Saved! ] =====================================')
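Note that some_page_text hard-codes a ?&page=N query-string pagination scheme. A board whose page URLs look like the hypothetical ones below will work as-is; anything else needs the page string adapted.

# Hypothetical example of the pagination pattern some_page_text expects.
base_url = 'https://example.com/board'     # what you would enter at the 'Input your url' prompt
urls = [base_url + '?&page=' + str(n) for n in range(1, 4)]
# -> https://example.com/board?&page=1 ... https://example.com/board?&page=3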
Testing sub_kim_crawl
if __name__ == '__main__':
    craw = sub_crawl()
    data = craw.some_page_text()
    craw.complete()
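For reference, the answers typed into get_parameters map onto a direct call roughly like this (hypothetical URL and selectors, and it assumes the kcc interface sketched above):

# Hypothetical, non-interactive equivalent of what one_page_text does.
craw = kcc('https://example.com/board?&page=1')          # i_url
posts = craw.get_obj('td', 'class', 'title', 'All')      # i_tag, i_val, i_class, i_func
text = '\n'.join(rep.text for rep in posts)
with open('board_titles.txt', 'a', encoding='utf-8') as f:   # file_name
    f.write(text)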
Packages required for Kim_Crawl_MK.2.1
# Several of these imports (urlr, bs, webdriver, pprint, re) are not used yet;
# they appear to be reserved for the unfinished image crawler.
from urllib.request import urlretrieve as urlr
from Kim_crawl_mk1 import crawl as kcc
from sub_kim_crawl import sub_crawl as scc
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from pprint import pprint
import os, re, sys, time
import winsound as ws   # Windows-only; used to beep on errors
Kim_Crawl_MK.2.1
try:
    class kim_crawl_class_mk2:
        def __init__(self):
            self.sub = scc()          # crawling work is delegated to sub_kim_crawl
            self.pwd = os.getcwd()    # remember the starting directory
        def main(self):
            # Top-level menu; each branch re-enters main() recursively.
            print('\n')
            print('================================= [ Hello, My name is Kim Crawl ] =================================')
            print('================================= [ How can I help you? ] =================================')
            print('\n')
            num = input('[1]. Only one page Crawling \t\t\t [2]. Some pages Crawling \n'
                        '[3]. Confirm your directory \t\t\t [!]. Quit Program \t\t\t \n'
                        '[?]. Select your function : ')
            if num == '1':
                self.mk_dir()
                print('\n')
                print('==========================================================================================================================')
                print('\n')
                num_2 = input('[1]. Text data \t\t\t [2]. Image data \n'
                              '[!]. Back \n'
                              '[?]. What kind of data do you need? : ')
                if num_2 == '1':
                    self.sub.one_page_text()
                    self.sub.complete()
                elif num_2 == '2':
                    self.img_crawl()
                elif num_2 == '!':
                    os.chdir(self.pwd)   # return to the starting directory before re-entering the menu
                    self.main()
                else:
                    print('You must select another option')
                    self.main()
            elif num == '2':
                self.mk_dir()
                print('\n')
                print('==========================================================================================================================')
                print('\n')
                num_2 = input('[1]. Text data \t\t\t [2]. Image data \n'
                              '[!]. Back \n'
                              '[?]. What kind of data do you need? : ')
                if num_2 == '1':
                    self.sub.some_page_text()
                    self.sub.complete()
                elif num_2 == '2':
                    self.img_crawl()
                elif num_2 == '!':
                    os.chdir(self.pwd)
                    self.main()
                else:
                    print('You must select another option')
                    self.main()
            elif num == '3':
                self.confirm_dir()
                self.main()
            elif num == '!':
                print('\n')
                are = input('[1]. Yes \t\t\t [2]. No \n'
                            '[?]. Are you sure? : ')
                if are == '1':
                    print('Good bye! \n')
                    sys.exit()
                elif are == '2':
                    self.main()
                else:
                    print('You must select another option')
                    self.main()
            else:
                print('You must select another option.')
                self.main()
        def img_crawl(self):
            # Placeholder: image crawling is not implemented yet.
            print('This function is not ready yet... :(')
            if not os.path.isdir('img_data'):
                os.mkdir('img_data')
            os.chdir('img_data')

        def is_continue(self):
            # Ask whether to keep crawling; loops until the user quits.
            cond = True
            while cond:
                print('\n\n')
                i_continue = input('[1]. Continue \t\t\t [!]. Quit \n'
                                   '[?]. Do you want to continue crawling? : ')
                if i_continue == '1':
                    self.main()
                elif i_continue == '!':
                    print('\n')
                    are = input('[1]. Yes \t\t\t [2]. No \n'
                                '[?]. Are you sure? : ')
                    if are == '1':
                        print('Good bye! \n')
                        sys.exit()
                    elif are == '2':
                        self.main()
                    else:
                        print('You must select another option')
                        self.main()
                else:
                    print('You must select another option.')
                    self.main()

        def mk_dir(self):
            # Create (if needed) and move into the top-level data/ directory.
            if not os.path.isdir('data'):
                os.mkdir('data')
            os.chdir('data')

        def confirm_dir(self):
            # Show the current working directory.
            print(os.getcwd())
    if __name__ == '__main__':
        pwd = os.getcwd()
        craw = kim_crawl_class_mk2()
        craw.main()
        time.sleep(2)
        os.chdir(pwd)
        craw.is_continue()
except NameError as e:
    # Alert loudly (console banner + Windows beeps) if a NameError occurs anywhere above.
    print('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! [ Alert! Error Occurred! ] !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
    print('Error Name : ', e)
    for rep in range(1, 6):
        ws.Beep(1500, 2000)
        time.sleep(5)
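Since img_crawl is still a stub, that is presumably why urlretrieve, BeautifulSoup, and selenium are imported but unused. A minimal sketch of what the image branch could eventually look like, assuming plain <img> tags with absolute src URLs (not the author's implementation):

# Hypothetical image-crawl sketch, not part of Kim_Crawl_MK.2.1.
from urllib.request import urlopen, urlretrieve
from bs4 import BeautifulSoup
import os


def img_crawl_sketch(url, out_dir='img_data'):
    os.makedirs(out_dir, exist_ok=True)
    soup = BeautifulSoup(urlopen(url).read(), 'html.parser')
    for idx, img in enumerate(soup.find_all('img')):
        src = img.get('src')
        if src and src.startswith('http'):
            # Keep the original extension if there is one, otherwise default to .jpg.
            ext = os.path.splitext(src)[1].split('?')[0] or '.jpg'
            urlretrieve(src, os.path.join(out_dir, str(idx) + ext))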
[1] Even though the crawling part has been split out into its own sub_kim_crawl module, the code in the main class is still very messy.
[2] Compared to Kim Crawl MK.2, what has changed is that multi-page crawling is now possible (in a limited way),
and the way the program responds when an error occurs is different.
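On point [2]: the outer try/except currently catches only NameError, so things like an unreachable URL or a non-numeric page number still crash the program. A sketch of a slightly broader guard around just the entry point (an alternative idea, not the current code; it assumes the class and imports defined above):

# Sketch of an alternative error guard around the entry point only.
# Reuses the same beep alert, then returns to the menu instead of exiting.
if __name__ == '__main__':
    craw = kim_crawl_class_mk2()
    while True:
        try:
            craw.main()
        except SystemExit:
            raise                      # let the Quit option exit normally
        except Exception as e:
            print('[ Alert! Error Occurred! ]')
            print('Error Name : ', e)
            ws.Beep(1500, 2000)        # Windows-only beep, as in the original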