Selenium (Mercury - 해외 투자자 정보)

1. “Mercury”에서 우리가 가져와야 할 데이터 - “해외 투자자 정보”

데이터 구조는 아래와 같다

# Data structure: one scraped investor record.
# Every field is a string except `industries`, which is a list of strings.
data_structure = {
    "name": "",
    "role": "",
    "type": "",
    "bio": "",
    "stages": "",
    "geography": "",
    "checkrange": "",
    "industries": [
        "",
        "",
        "",
    ],
    "aboutinvest": "",
    # The scraper and the DynamoDB item also carry the Twitter link.
    "twitterlink": "",
    "linkedinlink": "",
    "fundlink": "",
    "email": "",
}

가져와야 할 데이터를 화면으로 살펴보기

2. 코드

get_company_details() 함수 코드

투자자 한 명의 상세 화면에 진입한 상태에서 시작한다. 현재 화면에서 항목들을 수집해 results에 추가한 뒤, 뒤로 가기(back) 버튼을 눌러 목록 화면으로 돌아가는 과정까지 포함한다.

# Scraper for a single investor detail page.

# CSS prefixes shared by the selectors on the detail page.
_DETAIL_ROOT = '#root > main > div.styles__femLayoutWrapper_LCw8z > div.styles__wrapper_nF15Q.styles__grid_I8EVa'
_PROFILE_SECTION = _DETAIL_ROOT + ' > section:nth-child(5) > div'
_INVEST_SECTION = _DETAIL_ROOT + ' > section.styles__sectionWrapper_T8QXL.styles__grid_I8EVa.styles__investmentsSection_VwAYB > div'

# (result key, CSS selector) pairs for plain-text fields read before the
# industries list.
_TEXT_FIELDS = (
    ('name', _DETAIL_ROOT + ' > h1'),
    ('role', _PROFILE_SECTION + ' > div.styles__twoColumn_oi18L > div:nth-child(1) > p.styles__regularValue_qkp_Y'),
    ('type', _PROFILE_SECTION + ' > div.styles__twoColumn_oi18L > div:nth-child(2) > p.styles__regularValue_qkp_Y'),
    ('bio', _PROFILE_SECTION + ' > div:nth-child(2) > p.styles__regularValue_qkp_Y'),
    ('stages', _INVEST_SECTION + ' > div.styles__twoColumn_oi18L > div:nth-child(1) > p.styles__largeValue_fWvfH'),
    ('geography', _INVEST_SECTION + ' > div.styles__twoColumn_oi18L > div:nth-child(2) > p.styles__largeValue_fWvfH'),
    ('checkrange', _INVEST_SECTION + ' > div:nth-child(2) > p.styles__hugeValue_vB1_k'),
)
_ABOUTINVEST_CSS = _INVEST_SECTION + ' > div:nth-child(4) > p.styles__regularValue_qkp_Y'

_SEE_ALL_BUTTON_CSS = _INVEST_SECTION + ' > div:nth-child(3) > div > button'
_INDUSTRY_ITEMS_CSS = _INVEST_SECTION + ' > div:nth-child(3) > div > div'

# (result key, CSS selector, label used in the "not found" message).
# NOTE(review): the fundlink selector (`span > a`) also matches the element the
# twitter selector (`span:nth-child(1) > a`) targets; the last match wins.
_LINK_FIELDS = (
    ('twitterlink', _PROFILE_SECTION + ' > div:nth-child(3) > div > span:nth-child(1) > a', 'Twitter'),
    ('linkedinlink', _PROFILE_SECTION + ' > div:nth-child(3) > div > div > a', 'LinkedIn'),
    ('fundlink', _PROFILE_SECTION + ' > div:nth-child(3) > div > span > a', 'FundLink'),
)
_EMAIL_CSS = _DETAIL_ROOT + ' > section:nth-child(7) > div > div.styles__getInTouchContainer_ElM5P > div:nth-child(1) > span:nth-child(4) > a'
_BACK_CSS = _DETAIL_ROOT + ' > span > a'


def _store_last_text(data, key, css):
    """Store the text of the last element matching *css* under *key*, if any."""
    matches = driver.find_elements(By.CSS_SELECTOR, css)
    if matches:
        data[key] = matches[-1].text


def get_company_details():
    """Scrape the investor detail page that is currently open and go back.

    Reads the fields from the page through the module-level ``driver``,
    appends the resulting dict to the module-level ``results`` list, clicks
    the back link, and sleeps for three seconds.

    Text fields and ``email`` are only set when a matching element exists.
    The three link fields are set to the string ``'null'`` when no element
    matches. Any ``Exception`` raised while scraping or navigating is printed
    and swallowed; in that case nothing is appended and no back navigation
    happens.

    Returns:
        None.
    """
    data = {}
    try:
        for key, css in _TEXT_FIELDS:
            _store_last_text(data, key, css)

        # industries
        # 1. Expand the list with the "See All" button. The click is issued
        #    through JavaScript because the button is overlapped by another
        #    element.
        try:
            button_element = driver.find_element(By.CSS_SELECTOR, _SEE_ALL_BUTTON_CSS)
            driver.execute_script("arguments[0].click();", button_element)
        except NoSuchElementException:
            print("Button element not found. Skipping...")

        # 2. Read every child div directly. Looking each one up again with
        #    `div:nth-child(i)` would count non-div siblings too and could
        #    skip entries.
        industry_divs = driver.find_elements(By.CSS_SELECTOR, _INDUSTRY_ITEMS_CSS)
        print("하위 div 태그 수:", len(industry_divs))
        data['industries'] = [div.text for div in industry_divs]

        # aboutinvest (read after the industries list has been expanded)
        _store_last_text(data, 'aboutinvest', _ABOUTINVEST_CSS)

        # twitter / linkedin / fund links
        # find_elements returns an empty list instead of raising, so the
        # missing case is detected by emptiness.
        for key, css, label in _LINK_FIELDS:
            anchors = driver.find_elements(By.CSS_SELECTOR, css)
            if anchors:
                data[key] = anchors[-1].get_attribute('href')
            else:
                data[key] = 'null'
                print(f"{label} element not found. Skipping...")

        # email
        email_anchors = driver.find_elements(By.CSS_SELECTOR, _EMAIL_CSS)
        if email_anchors:
            data['email'] = email_anchors[-1].get_attribute('href')

        results.append(data)

        # Return to the previous page through the back link.
        back = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, _BACK_CSS))
        )
        back.click()
        time.sleep(3)
    except Exception as e:
        # Best-effort scraping: report the problem and let the caller continue.
        # There is deliberately no `finally: return`, which would also swallow
        # KeyboardInterrupt and SystemExit.
        print(e)

전체 루프

# 1-1. Page navigation: loop over every row of the investor list.
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

i = 2  # the first row opened is the container's 2nd child
while True:
    print(f"******** {i}번째 항목 *************")
    css = f'#investor-table > div.styles__investorTable_KqPd6 > div:nth-child({i}) > a > div'
    try:
        # WebDriverWait.until never returns None: when the row does not become
        # clickable within 10 seconds it raises TimeoutException, which marks
        # the end of the list.
        search = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, css)))
    except (TimeoutException, NoSuchElementException):
        print("END.")
        break

    # Open the detail page, scrape it, then move on to the next row.
    search.click()
    get_company_details()
    i += 1

    time.sleep(5)
    # Save to Excel
    # excel_name = "MercuryDatabase"
    # data_frame = pd.DataFrame(results)
    # data_frame.to_excel('{}.xlsx'.format(excel_name),sheet_name='{}'.format(excel_name),startrow=0,header=True)

DB연동

# DB integration

# Connect to DynamoDB
dynamodb = boto3.resource('dynamodb')
table_name = 'LUCK4_MERCURY_DB'
table = dynamodb.Table(table_name)  # created once, not once per record

# invest_id assigned to the first record of this run.
# NOTE(review): presumably continues after ids already stored in the table; confirm.
INVEST_ID_OFFSET = 279

# Attributes copied from each scraped record; missing ones become 'null'.
ITEM_FIELDS = (
    'name',
    'role',
    'type',
    'bio',
    'stages',
    'geography',
    'checkrange',
    'industries',
    'aboutinvest',
    'twitterlink',
    'linkedinlink',
    'fundlink',
    'email',
)

# Insert the records into DynamoDB. batch_writer buffers the puts and sends
# them in batches instead of one request per item.
with table.batch_writer() as batch:
    for i, result in enumerate(results):
        item = {'invest_id': str(i + INVEST_ID_OFFSET)}
        item.update((field, result.get(field, 'null')) for field in ITEM_FIELDS)
        batch.put_item(Item=item)

# Connect to Elastic Cloud
import os

# Credentials are read from the environment so that no secret lives in the
# source. Set ES_CLOUD_ID and ES_PASSWORD (and optionally ES_USERNAME) before
# running. Any credential that was previously committed here should be rotated.
cloud_id = os.environ['ES_CLOUD_ID']
username = os.environ.get('ES_USERNAME', 'elastic')
password = os.environ['ES_PASSWORD']

# Connect to the Elasticsearch cluster
es = Elasticsearch(
    cloud_id=cloud_id,
    basic_auth=(username, password),
)

# Index every scraped record into Elasticsearch
index_name = 'luck4_db_elastic'
for data_to_index in results:
    res = es.index(index=index_name, document=data_to_index)