# [Cloud9 environment] mercury.py - saves overseas investor information to DynamoDB and Elasticsearch

# get_company_details(): overseas investor information
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
import time
import pandas as pd
from elasticsearch import Elasticsearch
import boto3
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains

# Run Chrome without a visible window.
chrome_options = Options()
chrome_options.add_argument('--headless')

# Resolve the chromedriver binary once and reuse the returned path, instead of
# invoking the manager a second time just to print it.
chromedriver_path = ChromeDriverManager().install()
print(chromedriver_path)
driver = webdriver.Chrome(service=Service(chromedriver_path), options=chrome_options)

# Main page.
# The URL previously carried markdown autolink brackets ('<...>'), which is not
# a valid URL for driver.get().
driver.get('https://mercury.com/investor-database#investor-table')
driver.implicitly_wait(10)
# Column names of a scraped record.
col=["name","role","type","bio","stages","geography","checkrange","industries","aboutinvest","linkedinlink","fundlink","email"]
# Accumulates one dict per scraped investor; filled by get_company_details().
results=[]

# 화면 스크롤 함수 - 안씀!
def scroll_to_element(element):
    """Scroll the browser window until *element* is in view.

    Executes ``scrollIntoView(true)`` on the element through the module-level
    ``driver``.

    Args:
        element: The WebElement to bring into view.
    """
    script = "arguments[0].scrollIntoView(true);"
    driver.execute_script(script, element)

# 함수 
# Selector prefixes shared by the fields of an investor detail page.
_PAGE_CSS = '#root > main > div.styles__femLayoutWrapper_LCw8z > div.styles__wrapper_nF15Q.styles__grid_I8EVa'
_PROFILE_CSS = _PAGE_CSS + ' > section:nth-child(5) > div'
_INVEST_CSS = _PAGE_CSS + ' > section.styles__sectionWrapper_T8QXL.styles__grid_I8EVa.styles__investmentsSection_VwAYB > div'
_INDUSTRIES_CSS = _INVEST_CSS + ' > div:nth-child(3) > div'
_LINKS_CSS = _PROFILE_CSS + ' > div:nth-child(3) > div'

# (result key, selector) pairs whose value is the matched element's text.
# The order here is the key order of the resulting record.
_TEXT_FIELDS = (
    ('name', _PAGE_CSS + ' > h1'),
    ('role', _PROFILE_CSS + ' > div.styles__twoColumn_oi18L > div:nth-child(1) > p.styles__regularValue_qkp_Y'),
    ('type', _PROFILE_CSS + ' > div.styles__twoColumn_oi18L > div:nth-child(2) > p.styles__regularValue_qkp_Y'),
    ('bio', _PROFILE_CSS + ' > div:nth-child(2) > p.styles__regularValue_qkp_Y'),
    ('stages', _INVEST_CSS + ' > div.styles__twoColumn_oi18L > div:nth-child(1) > p.styles__largeValue_fWvfH'),
    ('geography', _INVEST_CSS + ' > div.styles__twoColumn_oi18L > div:nth-child(2) > p.styles__largeValue_fWvfH'),
    ('checkrange', _INVEST_CSS + ' > div:nth-child(2) > p.styles__hugeValue_vB1_k'),
)
_ABOUTINVEST_CSS = _INVEST_CSS + ' > div:nth-child(4) > p.styles__regularValue_qkp_Y'

# (result key, selector, label used in the "not found" message) for link fields
# that fall back to the string 'null' when the anchor is absent.
_LINK_FIELDS = (
    ('twitterlink', _LINKS_CSS + ' > span:nth-child(1) > a', 'Twitter'),
    ('linkedinlink', _LINKS_CSS + ' > div > a', 'LinkedIn'),
    ('fundlink', _LINKS_CSS + ' > span > a', 'FundLink'),
)
_EMAIL_CSS = _PAGE_CSS + ' > section:nth-child(7) > div > div.styles__getInTouchContainer_ElM5P > div:nth-child(1) > span:nth-child(4) > a'
_BACK_CSS = _PAGE_CSS + ' > span > a'


def _last_match(selector):
    """Return the last element matching *selector*, or None when nothing matches."""
    matches = driver.find_elements(By.CSS_SELECTOR, selector)
    return matches[-1] if matches else None


def get_company_details():
    """Scrape the investor detail page currently open in ``driver``.

    Collects the profile fields into a dict, appends that dict to the
    module-level ``results`` list, then clicks the page's back link and waits
    three seconds so the investor table can reload.

    Text fields and ``email`` are only present in the record when their element
    exists. ``twitterlink``, ``linkedinlink`` and ``fundlink`` are set to the
    string ``'null'`` when no matching anchor is found. ``industries`` is
    always a list (possibly empty).

    Any ``Exception`` raised while scraping or navigating back is printed and
    swallowed (best effort); in that case nothing is appended to ``results``.
    ``KeyboardInterrupt`` and ``SystemExit`` propagate to the caller.

    Returns:
        None
    """
    data = {}
    try:
        for key, selector in _TEXT_FIELDS:
            element = _last_match(selector)
            if element is not None:
                data[key] = element.text

        # industries
        # 1. Press the "See All" button through JavaScript, because a normal
        #    click is blocked by an overlapping element.
        try:
            see_all = driver.find_element(By.CSS_SELECTOR, _INDUSTRIES_CSS + ' > button')
            driver.execute_script("arguments[0].click();", see_all)
        except NoSuchElementException:
            print("Button element not found. Skipping...")

        # 2. Read every child div directly. Re-querying each one by
        #    nth-child(i) skips entries whenever a non-div sibling (such as the
        #    button) sits in front of them.
        industry_elements = driver.find_elements(By.CSS_SELECTOR, _INDUSTRIES_CSS + ' > div')
        print("하위 div 태그 수:", len(industry_elements))
        data['industries'] = [element.text for element in industry_elements]

        # aboutinvest
        about = _last_match(_ABOUTINVEST_CSS)
        if about is not None:
            data['aboutinvest'] = about.text

        # Link fields: find_elements() returns an empty list instead of raising
        # NoSuchElementException, so the missing case is detected explicitly.
        for key, selector, label in _LINK_FIELDS:
            anchor = _last_match(selector)
            if anchor is None:
                data[key] = 'null'
                print(f"{label} element not found. Skipping...")
            else:
                data[key] = anchor.get_attribute('href')

        # email
        email_anchor = _last_match(_EMAIL_CSS)
        if email_anchor is not None:
            data['email'] = email_anchor.get_attribute('href')

        results.append(data)
        print(results)

        # Return to the investor table through the page's own back link.
        back = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, _BACK_CSS)))
        back.click()
        time.sleep(3)
    except Exception as e:
        # Best effort: report the failure and let the caller continue.
        print(e)

# 1-1. Page navigation: walk the investor table row by row and scrape each
# row's detail page.
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

i = 280
try:
    while True:
        print("******** "+str(i)+"번째 항목 *************")
        css = f'#investor-table > div.styles__investorTable_KqPd6 > div:nth-child({i}) > a > div'
        try:
            # until() never returns None: when the row does not become
            # clickable within the timeout it raises TimeoutException, which is
            # how the end of the table shows up here.
            search = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, css)))
        except (TimeoutException, NoSuchElementException):
            print("END.")
            break

        # Open the row's detail page and scrape it.
        search.click()
        get_company_details()
        i += 1

        if i == 283:
            break
        # Pause before the next row (skipped after the final row).
        time.sleep(5)
        # print([str(i)+" 번째에서 전체 엑셀로 저장"])
        # excel_name = "MercuryDatabase444"
        # data_frame = pd.DataFrame(results)
        # data_frame.to_excel('{}.xlsx'.format(excel_name),sheet_name='{}'.format(excel_name),startrow=0,header=True)
finally:
    # Always release the browser process, including when scraping fails.
    driver.quit()
            

# DB integration

# Connect to DynamoDB
dynamodb=boto3.resource('dynamodb')
table_name='LUCK4_MERCURY_DB'
# The table handle does not change between items, so create it once.
table = dynamodb.Table(table_name)

# Attributes copied from each scraped record; a missing one is stored as the
# string 'null'.
item_fields = (
    'name', 'role', 'type', 'bio', 'stages', 'geography', 'checkrange',
    'industries', 'aboutinvest', 'twitterlink', 'linkedinlink', 'fundlink',
    'email',
)

# Insert the data into DynamoDB. batch_writer() buffers the puts and sends
# them in batches instead of one request per item.
with table.batch_writer() as batch:
    for i, result in enumerate(results):
        # NOTE(review): 279 looks like the scraper's start row (280) minus
        # one - confirm before changing either value.
        item = {'invest_id': str(i + 279)}
        item.update({field: result.get(field, 'null') for field in item_fields})
        batch.put_item(Item=item)

# Connect to Elasticsearch
import os

# Credentials are read from the environment so they do not have to live in
# source control. The previous literals remain as fallbacks so existing runs
# keep working when the variables are unset.
cloud_id = os.environ.get('ELASTIC_CLOUD_ID', 'univ==============================')
username = os.environ.get('ELASTIC_USERNAME', 'e===========')
password = os.environ.get('ELASTIC_PASSWORD', 'wNl===========')

# Connect to the Elasticsearch cluster
es = Elasticsearch(
    cloud_id=cloud_id,
    basic_auth=(username, password),
 )
# Index every scraped record as its own document
index_name = 'luck4_db_elastic'
for data_to_index in results:
    res = es.index(index=index_name, document=data_to_index)