- TED 강연 제목/대본 크롤링 - 20.07.19 ~ 20.07.21
class_define.py
class Topic:
    """A named topic and the speeches collected under it.

    Attributes:
        name: The value passed to the constructor, stored unchanged.
        speech: List of the objects handed to :meth:`add`, in call order.
    """

    def __init__(self, name) -> None:
        self.name = name
        # A fresh list per instance; add() appends to it.
        self.speech = []

    def add(self, speech) -> None:
        """Append *speech* to this topic's ``speech`` list."""
        self.speech.append(speech)

    def __str__(self):
        """Return the topic name."""
        return self.name

    def __repr__(self) -> str:
        """Return a debugging representation with the name and speech count."""
        return "{}(name={!r}, speeches={})".format(
            type(self).__name__, self.name, len(self.speech))
class Speech:
    """A single speech: its title, its speaker and, once set, its script.

    Attributes:
        title: The title passed to the constructor.
        speaker: The speaker passed to the constructor.
        script: ``None`` until a caller assigns the script text.
    """

    def __init__(self, title, speaker) -> None:
        self.title = title
        self.speaker = speaker
        # Not known at construction time; assigned later by the caller.
        self.script = None

    def __str__(self):
        """Return the speech title."""
        return self.title

    def __repr__(self) -> str:
        """Return a debugging representation with title and speaker."""
        return "{}(title={!r}, speaker={!r})".format(
            type(self).__name__, self.title, self.speaker)
selenium_class.py
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
class Driver:
    """Headless Chrome session with explicit-wait element lookups.

    The wrapped Selenium WebDriver is stored on ``self.driver`` and is also
    returned when the instance is called (``Driver()()``).

    Every ``find_*`` method blocks until the requested element(s) are present
    in the DOM and returns them; if the wait expires, the exception raised by
    ``WebDriverWait.until`` propagates to the caller.
    """

    # Seconds to wait when the search is rooted at the browser itself.
    _PAGE_TIMEOUT = 10
    # Seconds to wait when the search is rooted at a caller-supplied object.
    _OBJECT_TIMEOUT = 20

    def __init__(self):
        """Start headless Chrome with a fixed desktop user-agent string."""
        options = webdriver.ChromeOptions()
        options.add_argument('headless')
        options.add_argument(
            "user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36")
        self.driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)

    def __call__(self):
        """Return the underlying WebDriver."""
        return self.driver

    def get_url(self, url):
        """Navigate the browser to *url*."""
        self.driver.get(url)

    # ------------------------------------------------------------------
    # Frame switching
    # ------------------------------------------------------------------
    # These use ``driver.switch_to.<target>()`` rather than the
    # ``switch_to_<target>()`` helpers, which newer Selenium releases no
    # longer provide. They retry until the switch succeeds, but catch only
    # ``Exception`` so that Ctrl-C / SystemExit can still stop the loop.

    def get_default(self):
        """Switch to the top-level document, retrying until it succeeds."""
        while True:
            try:
                self.driver.switch_to.default_content()
                return
            except Exception:
                print('default frame 이동')

    def get_fra(self, name):
        """Switch into frame *name*, retrying until it succeeds.

        After each failed attempt the browser is moved back to the top-level
        document before trying again.
        """
        while True:
            try:
                self.driver.switch_to.frame(name)
                return
            except Exception:
                self.get_default()
                print(name, 'frame 이동')

    def get_top(self):
        """Switch into the frame named ``'top'``, retrying until it succeeds.

        After each failed attempt the ``'body'`` frame is entered first (via
        :meth:`get_fra`) before trying again.
        """
        while True:
            try:
                self.driver.switch_to.frame('top')
                return
            except Exception:
                self.get_fra('body')
                print('body', 'frame 이동')

    # ------------------------------------------------------------------
    # Element lookup
    # ------------------------------------------------------------------

    def _wait_for(self, condition, by, value, root=None, timeout=None):
        """Wait until ``condition((by, value))`` holds and return its result.

        *root* is the object handed to ``WebDriverWait`` (the browser when
        omitted); *timeout* defaults to the page-level timeout.
        """
        if root is None:
            root = self.driver
        if timeout is None:
            timeout = self._PAGE_TIMEOUT
        return WebDriverWait(root, timeout).until(condition((by, value)))

    def find_by_xpath(self, xpath):
        """Return the first element matching the XPath expression."""
        return self._wait_for(EC.presence_of_element_located, By.XPATH, xpath)

    def find_by_class(self, class_name):
        """Return the first element with the given class name."""
        return self._wait_for(
            EC.presence_of_element_located, By.CLASS_NAME, class_name)

    def find_by_tag(self, tag):
        """Return the first element with the given tag name."""
        return self._wait_for(EC.presence_of_element_located, By.TAG_NAME, tag)

    def find_by_name(self, name):
        """Return the first element with the given ``name`` attribute."""
        return self._wait_for(EC.presence_of_element_located, By.NAME, name)

    def find_all_by_class(self, class_name):
        """Return every element with the given class name."""
        # Must search by class name; this lookup previously used By.TAG_NAME
        # and therefore never matched a class.
        return self._wait_for(
            EC.presence_of_all_elements_located, By.CLASS_NAME, class_name)

    def find_all_by_tag(self, tag):
        """Return every element with the given tag name."""
        return self._wait_for(
            EC.presence_of_all_elements_located, By.TAG_NAME, tag)

    def find_all_by_name(self, name):
        """Return every element with the given ``name`` attribute."""
        return self._wait_for(
            EC.presence_of_all_elements_located, By.NAME, name)

    def find_all_by_tag_with_obj(self, obj, name):
        """Return every element with tag *name*, waiting on *obj*.

        *obj* is passed to ``WebDriverWait`` in place of the browser; the
        parameter is called *name* but it is used as a tag name.
        """
        return self._wait_for(
            EC.presence_of_all_elements_located, By.TAG_NAME, name,
            root=obj, timeout=self._OBJECT_TIMEOUT)

    def find_by_tag_with_obj(self, obj, name):
        """Return the first element with tag *name*, waiting on *obj*.

        *obj* is passed to ``WebDriverWait`` in place of the browser; the
        parameter is called *name* but it is used as a tag name.
        """
        return self._wait_for(
            EC.presence_of_element_located, By.TAG_NAME, name,
            root=obj, timeout=self._OBJECT_TIMEOUT)

    def find_by_link(self, text):
        """Return the first link whose visible text equals *text*."""
        return self._wait_for(EC.presence_of_element_located, By.LINK_TEXT, text)

    def click(self, btn):
        """Click *btn* by running ``arguments[0].click();`` in the page."""
        self.driver.execute_script("arguments[0].click();", btn)
# Listing endpoint and transcript-cell selector used by the crawler below.
_TED_TALKS_URL = "https://www.ted.com/talks"
_TRANSCRIPT_SELECTOR = "div.Grid__cell.flx-s\\:1"
# Status codes that are retried (with a pause) instead of being accepted.
_RETRY_STATUSES = (429, 500)


def _normalize_ws(text):
    """Collapse every run of whitespace in *text* to a single space."""
    return " ".join(text.split())


def _talk_slug(title, speaker):
    """Build the ``<speaker>_<title>`` path segment used for a talk URL."""
    title_part = re.sub("[^a-zA-Z0-9]", "_", "_".join(title.split()).lower())
    # Collapse underscore runs and drop a trailing one; safe on empty input.
    title_part = re.sub("_{2,}", "_", title_part).rstrip("_")
    speaker_part = re.sub("[^a-zA-Z0-9]", "_", speaker.lower())
    return speaker_part + "_" + title_part


def _get_with_retry(url, params=None, attempts=5, pause=2.0):
    """GET *url*, retrying on 429/500 up to *attempts* times with a pause.

    Returns the last response received, whatever its status.
    """
    import time

    response = requests.get(url, params=params)
    for _ in range(attempts - 1):
        if response.status_code not in _RETRY_STATUSES:
            break
        time.sleep(pause)
        response = requests.get(url, params=params)
    return response


def _page_count(topic):
    """Return the number of listing pages for *topic* (at least 1).

    The count is read from the last ``a.pagination__item`` link. When the
    listing has no such links, or the last one is not a number, a single
    page is assumed instead of retrying forever.
    """
    response = _get_with_retry(_TED_TALKS_URL, params={"topics[]": topic})
    links = BeautifulSoup(response.text, 'html.parser').select(
        'a.pagination__item')
    if not links:
        return 1
    try:
        return max(1, int(links[-1].text))
    except ValueError:
        return 1


def _transcript_from_html(page_text):
    """Extract the transcript from static HTML; '' when none is found.

    Takes the first ``<p>`` of every transcript cell, like the original
    loop, but skips cells without a paragraph.
    """
    soup = BeautifulSoup(page_text, 'html.parser')
    parts = []
    for cell in soup.select(_TRANSCRIPT_SELECTOR):
        paragraph = cell.select_one('p')
        if paragraph is None:
            continue
        cleaned = _normalize_ws(paragraph.text)
        if cleaned:
            parts.append(cleaned)
    return " ".join(parts)


def _transcript_via_browser(driver, url):
    """Load *url* in the browser and read the transcript cells.

    Returns ``None`` when the page title contains '404', '' when no cell
    appears within the wait, otherwise the joined cell text.
    """
    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait

    driver.get_url(url)
    browser = driver()  # the raw WebDriver; the wrapper has no ``title``
    if '404' in browser.title:
        return None
    try:
        # The locator is a CSS selector, so search by CSS, not class name.
        cells = WebDriverWait(browser, 10).until(
            lambda d: d.find_elements(By.CSS_SELECTOR, _TRANSCRIPT_SELECTOR))
    except TimeoutException:
        return ''
    texts = (_normalize_ws(cell.text) for cell in cells)
    return " ".join(text for text in texts if text)


def crawl_ted_talks(topics):
    """Collect title, speaker and transcript for every talk of each topic.

    NOTE(review): the ``def`` line of this routine is missing from the notes
    (only the body and a trailing ``return result`` survive), so the function
    name is a reconstruction -- rename it to match the original if known.
    The body relies on ``re``, ``requests``, ``BeautifulSoup``, ``Topic``,
    ``Speech`` and ``Driver`` being importable in this module.

    Args:
        topics: Iterable of topic names as used in the ``topics[]`` query
            parameter of the talks listing.

    Returns:
        A list with one ``Topic`` per input topic, each holding the
        ``Speech`` objects whose transcript page could be reached. Double
        quotes in titles and scripts are doubled (``"`` -> ``""``).
    """
    result = []
    driver = None  # Chrome is started only if the static HTML has no script
    try:
        for t in topics:
            t_ = t.replace('"', '""')
            topic_obj = Topic(t)
            for n in range(1, _page_count(t) + 1):
                raw = _get_with_retry(
                    _TED_TALKS_URL, params={"page": n, "topics[]": t})
                listing = BeautifulSoup(raw.text, 'html.parser')
                for cont in listing.select("div.talk-link"):
                    title_tag = cont.select_one("h4 a")
                    speaker_tag = cont.select_one("h4.h12")
                    if title_tag is None or speaker_tag is None:
                        continue  # card without the expected markup
                    title = title_tag.text.strip()
                    speaker = speaker_tag.text.strip()
                    speech_obj = Speech(title.replace('"', '""'), speaker)
                    print(title, speaker, t_)

                    url = "{}/{}/transcript".format(
                        _TED_TALKS_URL, _talk_slug(title, speaker))
                    x = _get_with_retry(url)
                    # 404: the guessed slug does not exist. 429/500 here
                    # means the retries were exhausted; skip the talk.
                    if x.status_code == 404 or x.status_code in _RETRY_STATUSES:
                        continue

                    script = _transcript_from_html(x.text)
                    if script == '':
                        if driver is None:
                            driver = Driver()
                        script = _transcript_via_browser(driver, url)
                        if script is None:
                            continue
                    speech_obj.script = script.replace('"', '""')
                    topic_obj.add(speech_obj)
            result.append(topic_obj)
    finally:
        # Release the browser process even when the crawl aborts.
        if driver is not None:
            driver().quit()
    return result