데이터 구조는 아래와 같다.
# Data structure
# Template of one record: every value is a string except `industries`,
# which is a list of strings. Written as a dict because `key: value` pairs
# are not valid inside a Python list literal.
data_structure = {
    "name": "",
    "role": "",
    "type": "",
    "bio": "",
    "stages": "",
    "geography": "",
    "checkrange": "",
    "industries": [
        "",
        "",
        "",
    ],
    "aboutinvest": "",
    # Collected by get_company_details() and written to DynamoDB as well,
    # so it is listed here to keep the outline in step with the code.
    "twitterlink": "",
    "linkedinlink": "",
    "fundlink": "",
    "email": "",
}
get_company_details() 함수 코드
# Function
# CSS selector prefixes shared by every lookup on the detail page.
_PAGE_CSS = '#root > main > div.styles__femLayoutWrapper_LCw8z > div.styles__wrapper_nF15Q.styles__grid_I8EVa'
_PROFILE_CSS = _PAGE_CSS + ' > section:nth-child(5) > div'
_INVEST_CSS = _PAGE_CSS + ' > section.styles__sectionWrapper_T8QXL.styles__grid_I8EVa.styles__investmentsSection_VwAYB > div'

# (record key, selector) pairs whose value is the matched element's text.
_TEXT_FIELDS = (
    ('name', _PAGE_CSS + ' > h1'),
    ('role', _PROFILE_CSS + ' > div.styles__twoColumn_oi18L > div:nth-child(1) > p.styles__regularValue_qkp_Y'),
    ('type', _PROFILE_CSS + ' > div.styles__twoColumn_oi18L > div:nth-child(2) > p.styles__regularValue_qkp_Y'),
    ('bio', _PROFILE_CSS + ' > div:nth-child(2) > p.styles__regularValue_qkp_Y'),
    ('stages', _INVEST_CSS + ' > div.styles__twoColumn_oi18L > div:nth-child(1) > p.styles__largeValue_fWvfH'),
    ('geography', _INVEST_CSS + ' > div.styles__twoColumn_oi18L > div:nth-child(2) > p.styles__largeValue_fWvfH'),
    ('checkrange', _INVEST_CSS + ' > div:nth-child(2) > p.styles__hugeValue_vB1_k'),
)

# (record key, selector, label used in the "not found" message) triples whose
# value is the matched anchor's href, or the string 'null' when nothing matches.
# NOTE(review): the fundlink selector (`span > a`) also matches the anchor the
# twitter selector (`span:nth-child(1) > a`) targets, so when only one span
# link exists both keys receive the same href -- confirm against the page DOM.
_LINK_FIELDS = (
    ('twitterlink', _PROFILE_CSS + ' > div:nth-child(3) > div > span:nth-child(1) > a', 'Twitter'),
    ('linkedinlink', _PROFILE_CSS + ' > div:nth-child(3) > div > div > a', 'LinkedIn'),
    ('fundlink', _PROFILE_CSS + ' > div:nth-child(3) > div > span > a', 'FundLink'),
)

_SEE_ALL_BUTTON_CSS = _INVEST_CSS + ' > div:nth-child(3) > div > button'
_INDUSTRY_CSS = _INVEST_CSS + ' > div:nth-child(3) > div > div'
_ABOUT_INVEST_CSS = _INVEST_CSS + ' > div:nth-child(4) > p.styles__regularValue_qkp_Y'
_EMAIL_CSS = _PAGE_CSS + ' > section:nth-child(7) > div > div.styles__getInTouchContainer_ElM5P > div:nth-child(1) > span:nth-child(4) > a'
_BACK_CSS = _PAGE_CSS + ' > span > a'


def _find_all(css):
    """Return every element currently matching *css* (an empty list if none)."""
    return driver.find_elements(By.CSS_SELECTOR, css)


def _scrape_details():
    """Build one record dict from the detail page currently open in `driver`."""
    data = {}

    # Plain text fields: when several elements match, the last one wins;
    # when none match, the key is left out of the record.
    for key, css in _TEXT_FIELDS:
        matches = _find_all(css)
        if matches:
            data[key] = matches[-1].text

    # industries
    # 1. Press the "See All" button. The click is issued through JavaScript
    #    because of an element-overlap problem with a regular click.
    try:
        see_all = driver.find_element(By.CSS_SELECTOR, _SEE_ALL_BUTTON_CSS)
        driver.execute_script("arguments[0].click();", see_all)
    except NoSuchElementException:
        print("Button element not found. Skipping...")
    # 2. Read every industry div in a single lookup. Iterating the matched
    #    elements directly avoids one query per index and cannot skip a div
    #    whose position is shifted by a non-div sibling such as the button.
    industry_divs = _find_all(_INDUSTRY_CSS)
    print("하위 div 태그 수:", len(industry_divs))
    data['industries'] = [div.text for div in industry_divs]

    # aboutinvest
    about_matches = _find_all(_ABOUT_INVEST_CSS)
    if about_matches:
        data['aboutinvest'] = about_matches[-1].text

    # Social / fund links. find_elements() never raises
    # NoSuchElementException, so a missing link is detected by the empty
    # result list rather than by an except clause.
    for key, css, label in _LINK_FIELDS:
        anchors = _find_all(css)
        if anchors:
            data[key] = anchors[-1].get_attribute('href')
        else:
            data[key] = 'null'
            print(f"{label} element not found. Skipping...")

    # email: stored as the raw href of the anchor (presumably a "mailto:"
    # URL -- TODO confirm); the key is omitted when no anchor matches.
    email_anchors = _find_all(_EMAIL_CSS)
    if email_anchors:
        data['email'] = email_anchors[-1].get_attribute('href')

    return data


def get_company_details():
    """Scrape the open detail page into `results`, then navigate back.

    Reads the module-level `driver` and appends one dict to the module-level
    `results` list. Errors are printed rather than raised so that the calling
    loop keeps running. Returns None.

    The record is appended only when scraping completes without error. The
    back link is clicked in either case, so the caller is not left on the
    detail page after a failed scrape.

    Closing the browser is left to the caller: the `driver.quit()` that used
    to follow this function could never run after the returning `finally`.
    """
    try:
        results.append(_scrape_details())
    except Exception as e:
        print(e)

    # Press the link that returns to the previous page.
    try:
        back = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, _BACK_CSS))
        )
        back.click()
        # Fixed pause after navigating back (presumably to let the list page
        # render again before the caller looks up the next row).
        time.sleep(3)
    except Exception as e:
        print(e)
전체 루프
# 1-1. Page traversal (loop over every row of the investor table)
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Rows are addressed with :nth-child, starting from the second child.
i = 2
while True:
    print(f"******** {i}번째 항목 *************")
    css = f'#investor-table > div.styles__investorTable_KqPd6 > div:nth-child({i}) > a > div'
    try:
        # until() raises TimeoutException when the row never becomes
        # clickable; it does not return None and does not raise
        # NoSuchElementException, so the timeout is the end-of-table signal.
        search = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, css)))
    except (TimeoutException, NoSuchElementException):
        print("END.")
        driver.quit()
        break
    # Open the row's detail page, scrape it, and come back to the table.
    search.click()
    get_company_details()
    i += 1
    time.sleep(5)
    # Save to Excel
    # excel_name = "MercuryDatabase"
    # data_frame = pd.DataFrame(results)
    # data_frame.to_excel('{}.xlsx'.format(excel_name),sheet_name='{}'.format(excel_name),startrow=0,header=True)
DB 연동
# DB integration
# Connect to DynamoDB
dynamodb = boto3.resource('dynamodb')
table_name = 'LUCK4_MERCURY_DB'
# Resolve the table once instead of on every loop iteration.
table = dynamodb.Table(table_name)

# invest_id assigned to the first record; later records count up from here.
# NOTE(review): presumably continues after ids already stored in the table --
# confirm before re-running, since reusing an id overwrites that item.
INVEST_ID_START = 279

# Attributes copied from each scraped record; a missing key is stored as the
# string 'null'.
ITEM_FIELDS = (
    'name',
    'role',
    'type',
    'bio',
    'stages',
    'geography',
    'checkrange',
    'industries',
    'aboutinvest',
    'twitterlink',
    'linkedinlink',
    'fundlink',
    'email',
)

# Insert the data into DynamoDB. batch_writer() groups the puts into batch
# requests and flushes whatever is still buffered when the block exits.
with table.batch_writer() as batch:
    for invest_id, result in enumerate(results, start=INVEST_ID_START):
        item = {'invest_id': str(invest_id)}
        item.update((field, result.get(field, 'null')) for field in ITEM_FIELDS)
        batch.put_item(Item=item)
# Connect to Elastic
import os

# Credentials are read from the environment instead of being embedded in the
# source. The cloud id and password used to be hard-coded at this spot, so
# that password should be rotated.
# Required: ELASTIC_CLOUD_ID, ELASTIC_PASSWORD. Optional: ELASTIC_USERNAME.
cloud_id = os.environ['ELASTIC_CLOUD_ID']
username = os.environ.get('ELASTIC_USERNAME', 'elastic')
password = os.environ['ELASTIC_PASSWORD']

# Connect to the Elasticsearch cluster
es = Elasticsearch(
    cloud_id=cloud_id,
    basic_auth=(username, password),
)

# Index the data into Elasticsearch. Each record is sent as scraped, without
# the 'null' placeholders for missing keys.
index_name = 'luck4_db_elastic'
for data_to_index in results:
    es.index(index=index_name, document=data_to_index)