# get_company_details()
# : 해외 투자사 정보 (overseas investor information)
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
import time
import pandas as pd
from elasticsearch import Elasticsearch
import boto3
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
# Start a headless Chrome session and open the Mercury investor database.
chrome_options = Options()
chrome_options.add_argument('--headless')
# Resolve the chromedriver path once and reuse it, instead of calling
# ChromeDriverManager().install() a second time just to print the path.
driver_path = ChromeDriverManager().install()
print(driver_path)
driver = webdriver.Chrome(service=Service(driver_path), options=chrome_options)
# Main page.
# The URL literal used to be wrapped in '<' and '>' (auto-link markup left
# over from a paste); that is not a valid URL for driver.get().
driver.get('https://mercury.com/investor-database#investor-table')
# Implicit wait applied to subsequent element lookups on this driver.
driver.implicitly_wait(10)
# Column names for the scraped record; not referenced elsewhere in the
# visible code.
col=["name","role","type","bio","stages","geography","checkrange","industries","aboutinvest","linkedinlink","fundlink","email"]
# One dict per scraped investor; appended to by get_company_details().
results=[]
# Page-scroll helper - currently not used!
def scroll_to_element(element):
    """Scroll the page so that ``element`` is brought into view.

    Runs ``scrollIntoView(true)`` on the element through the module-level
    ``driver``. Returns nothing.
    """
    scroll_script = "arguments[0].scrollIntoView(true);"
    driver.execute_script(scroll_script, element)
# Detail-page scraping: shared CSS selector prefixes, a lookup helper and
# the get_company_details() entry point.

# Wrapper that every selector on the investor detail page starts with.
_PAGE_CSS = '#root > main > div.styles__femLayoutWrapper_LCw8z > div.styles__wrapper_nF15Q.styles__grid_I8EVa'
# Section holding role / type / bio and the social links.
_PROFILE_CSS = _PAGE_CSS + ' > section:nth-child(5) > div'
# Section holding stages / geography / check range / industries / about.
_INVESTMENTS_CSS = _PAGE_CSS + ' > section.styles__sectionWrapper_T8QXL.styles__grid_I8EVa.styles__investmentsSection_VwAYB > div'
# Container of the industry entries and of the "See All" button.
_INDUSTRIES_CSS = _INVESTMENTS_CSS + ' > div:nth-child(3) > div'

# (result key, selector) pairs whose value is the element text, in the
# order they are scraped.
_TEXT_FIELDS = (
    ('name', _PAGE_CSS + ' > h1'),
    ('role', _PROFILE_CSS + ' > div.styles__twoColumn_oi18L > div:nth-child(1) > p.styles__regularValue_qkp_Y'),
    ('type', _PROFILE_CSS + ' > div.styles__twoColumn_oi18L > div:nth-child(2) > p.styles__regularValue_qkp_Y'),
    ('bio', _PROFILE_CSS + ' > div:nth-child(2) > p.styles__regularValue_qkp_Y'),
    ('stages', _INVESTMENTS_CSS + ' > div.styles__twoColumn_oi18L > div:nth-child(1) > p.styles__largeValue_fWvfH'),
    ('geography', _INVESTMENTS_CSS + ' > div.styles__twoColumn_oi18L > div:nth-child(2) > p.styles__largeValue_fWvfH'),
    ('checkrange', _INVESTMENTS_CSS + ' > div:nth-child(2) > p.styles__hugeValue_vB1_k'),
)

# (result key, selector, label used in the "not found" message) for the
# optional links, whose value is the anchor's href.
_LINK_FIELDS = (
    ('twitterlink', _PROFILE_CSS + ' > div:nth-child(3) > div > span:nth-child(1) > a', 'Twitter'),
    ('linkedinlink', _PROFILE_CSS + ' > div:nth-child(3) > div > div > a', 'LinkedIn'),
    ('fundlink', _PROFILE_CSS + ' > div:nth-child(3) > div > span > a', 'FundLink'),
)


def _read_text(element):
    """Return the visible text of a WebElement."""
    return element.text


def _read_href(element):
    """Return the ``href`` attribute of a WebElement."""
    return element.get_attribute('href')


def _assign_last(data, key, selector, read):
    """Store ``read(last match of selector)`` in ``data[key]``.

    Returns True when at least one element matched and False otherwise, in
    which case ``data`` is left untouched. Taking the last match mirrors the
    previous "overwrite in a loop" behaviour.
    """
    matches = driver.find_elements(By.CSS_SELECTOR, selector)
    if not matches:
        return False
    data[key] = read(matches[-1])
    return True


def get_company_details():
    """Scrape the open investor detail page and navigate back to the list.

    Reads the fields from the page currently loaded in the module-level
    ``driver``, appends them as one dict to the module-level ``results``
    list, clicks the page's back link and sleeps 3 seconds.

    Keys for text fields and ``email`` are only present when the element was
    found. ``industries`` is always a list. The three link keys fall back to
    the string ``'null'`` when the link is missing.

    Returns:
        None.

    Any ``Exception`` raised while scraping or navigating is printed and
    suppressed (best-effort scraping); in that case nothing is appended and
    the back link is not clicked. ``KeyboardInterrupt`` and other
    non-``Exception`` errors propagate.
    """
    data = {}
    try:
        # name, role, type, bio, stages, geography, checkrange
        for key, selector in _TEXT_FIELDS:
            _assign_last(data, key, selector, _read_text)

        # industries
        # 1. Press the "See All" button through JavaScript, because a normal
        #    click is blocked by an overlapping element.
        try:
            see_all = driver.find_element(By.CSS_SELECTOR, _INDUSTRIES_CSS + ' > button')
            driver.execute_script("arguments[0].click();", see_all)
        except NoSuchElementException:
            print("Button element not found. Skipping...")
        # 2. Collect the entries. The matched divs are read directly rather
        #    than re-queried with div:nth-child(i): nth-child counts every
        #    sibling (including the button), so that form can skip entries.
        industry_divs = driver.find_elements(By.CSS_SELECTOR, _INDUSTRIES_CSS + ' > div')
        print("하위 div 태그 수:", len(industry_divs))
        data['industries'] = [div.text for div in industry_divs]

        # aboutinvest
        _assign_last(data, 'aboutinvest', _INVESTMENTS_CSS + ' > div:nth-child(4) > p.styles__regularValue_qkp_Y', _read_text)

        # twitter / linkedin / fund links.
        # find_elements() returns an empty list instead of raising
        # NoSuchElementException, so the missing case is detected from the
        # helper's return value.
        for key, selector, label in _LINK_FIELDS:
            if not _assign_last(data, key, selector, _read_href):
                data[key] = 'null'
                print(f"{label} element not found. Skipping...")

        # email
        _assign_last(data, 'email', _PAGE_CSS + ' > section:nth-child(7) > div > div.styles__getInTouchContainer_ElM5P > div:nth-child(1) > span:nth-child(4) > a', _read_href)

        results.append(data)
        print(results)

        # Click the link that leads back to the previous (list) page.
        back_css = _PAGE_CSS + ' > span > a'
        back = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, back_css)))
        back.click()
        time.sleep(3)
    except Exception as e:
        # Best-effort boundary: report the problem and let the caller go on.
        print(e)
# 1-1. Page navigation: loop over the rows of the investor table, open each
# one and scrape its detail page.
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# WebDriverWait.until() signals "row never became clickable" with
# TimeoutException, so that is what marks the end of the table.
from selenium.common.exceptions import TimeoutException
i = 280
try:
    while True:
        print("******** "+str(i)+"번째 항목 *************")
        css = f'#investor-table > div.styles__investorTable_KqPd6 > div:nth-child({i}) > a > div'
        try:
            search = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, css)))
        except (TimeoutException, NoSuchElementException):
            # No such row: the table has been exhausted.
            print("END.")
            break
        # Open the row's detail page and scrape it; get_company_details()
        # navigates back to the table afterwards.
        search.click()
        get_company_details()
        i += 1
        time.sleep(5)
        # Hard stop for this run: rows 280, 281 and 282 are processed.
        if i == 283:
            break
        # (An Excel export of `results` through pandas was sketched here in
        # commented-out form.)
finally:
    # Nothing after this point uses the browser; always release it, even
    # when the loop ends with an error.
    driver.quit()
# DB integration
# Connect to DynamoDB.
dynamodb=boto3.resource('dynamodb')
table_name='LUCK4_MERCURY_DB'
# The table handle does not depend on the item, so create it once instead
# of once per loop iteration.
table = dynamodb.Table(table_name)
# Attributes copied from each scraped record; a missing key is stored as the
# string 'null'.
item_fields = (
    'name', 'role', 'type', 'bio', 'stages', 'geography', 'checkrange',
    'industries', 'aboutinvest', 'twitterlink', 'linkedinlink', 'fundlink',
    'email',
)
# Insert the data into DynamoDB, one put_item call per record.
for i, result in enumerate(results):
    # NOTE(review): ids start at 279 while the crawl starts at table row
    # 280 - confirm the offset is intended.
    item = {'invest_id': str(i + 279)}
    item.update((field, result.get(field, 'null')) for field in item_fields)
    table.put_item(Item=item)
# Connect to Elastic.
# Credentials can be supplied through the environment so they do not have
# to live in the source file; the previous literals stay as fallbacks so
# existing runs keep working.
# NOTE(review): remove the fallback literals once the environment variables
# are configured wherever this script runs.
import os
cloud_id = os.environ.get('ELASTIC_CLOUD_ID', 'univ==============================')
username = os.environ.get('ELASTIC_USERNAME', 'e===========')
password = os.environ.get('ELASTIC_PASSWORD', 'wNl===========')
# Connect to the Elasticsearch cluster.
es = Elasticsearch(
    cloud_id=cloud_id,
    basic_auth=(username, password),
)
# Index every scraped record as its own document.
index_name = 'luck4_db_elastic'
for data_to_index in results:
    es.index(index=index_name, document=data_to_index)