Selenium (혁신의숲 - Portfolio Data)

1. “혁신의 숲”에서 우리가 가져와야 할 두 번째 데이터 - “Portfolio Data”

각 투자사마다 투자한 스타트업들의 기록을 가져와야 했다.
추후 투자사들의 성향을 파악할 때 “중복 투자 날짜”를 확인하고 “최근 투자일수록 더 높은 가중치”를 주려면 꼭 필요한 정보들이기에 크롤링하게 되었다.
데이터 항목
- 스타트업 이름
- 투자 날짜
- 투자 금액
- 투자 단계
크롤링 순서 화면으로 보기

2. 크롤링 코드

# from selenium import webdriver
# from selenium.webdriver.common.keys import Keys
# from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
import time

# Option that prevents the browser from closing
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Build the Chrome session used by every step of the crawl.
chrome_options = Options()
# "detach" is the experimental option this script relies on to keep the
# browser from closing.
chrome_options.add_experimental_option("detach", True)
driver = webdriver.Chrome(options=chrome_options)

# Open the investor list. The URL must be passed bare: the previous value was
# wrapped in '<' and '>' (a markup-export artifact), which is not a valid URL.
driver.get('https://www.innoforest.co.kr/dataroom/investor')
# Implicit wait: find_element / find_elements calls poll for up to 10 seconds.
# NOTE(review): the rest of the script also uses explicit WebDriverWait calls;
# Selenium's documentation advises against mixing both kinds of wait because
# the effective timeout becomes hard to predict -- confirm this is intended.
driver.implicitly_wait(10)

##### Collect the portfolio records shown on the investor detail page #####
def get_investor_portfolio(max_rows=6):
    """Scrape the portfolio rows currently displayed and store them.

    Reads the module-level ``driver`` and ``investor_name`` and appends one
    dict per row to the module-level ``list_of_dict``. Each dict has the keys
    ``investor``, ``name``, ``dates``, ``stage`` and ``money``; the last three
    are always lists (one entry per investment round, possibly empty) because
    the same startup can be funded several times by one investor.

    Args:
        max_rows: Highest row index to look for on the page. Defaults to 6,
            the row count the original loop was hard-coded to.

    Scraping stops at the first row index that does not exist, so a page
    holding fewer than ``max_rows`` rows no longer raises
    ``NoSuchElementException`` part-way through.
    """
    # Container that holds the row <div>s of the portfolio table.
    rows_xpath = ('//*[@id="investor3"]/div/div/div[2]/div/div[2]/div/div[1]'
                  '/div[2]/div[1]/div[2]')

    def _cell_texts(row_xpath, column):
        # Return the text of every entry stacked in one column of one row.
        cells = driver.find_elements(By.XPATH, f'{row_xpath}/div[{column}]/div/div')
        return [cell.text for cell in cells]

    for div_count in range(1, max_rows + 1):
        row_xpath = f'{rows_xpath}/div[{div_count}]'

        # Startup name; a missing name element means there are no more rows.
        try:
            name = driver.find_element(By.XPATH, f'{row_xpath}/div[1]/div/div/div/span')
        except NoSuchElementException:
            break

        # Build the record in one place so every key is present even when a
        # column has no entries.
        inside = {
            'investor': investor_name,
            'name': name.text,
            'dates': _cell_texts(row_xpath, 2),   # investment dates
            'stage': _cell_texts(row_xpath, 3),   # stage at investment time
            'money': _cell_texts(row_xpath, 4),   # invested amounts
        }

        # All investors' portfolio rows are accumulated in one shared list.
        list_of_dict.append(inside)

##### Log in #####
import os

# Credentials are read from the environment so that no account secret is
# stored in the source file (the previous version hard-coded the password).
login_username = os.environ.get("INNOFOREST_USERNAME")
login_password = os.environ.get("INNOFOREST_PASSWORD")
if not login_username or not login_password:
    raise RuntimeError(
        "Set the INNOFOREST_USERNAME and INNOFOREST_PASSWORD environment "
        "variables before running this crawler."
    )

# 1. Click the login link in the page header.
login_button_xpath = '//*[@id="__next"]/header/div[1]/div/ul/li[1]/a'
login_button = WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.XPATH, login_button_xpath)))
login_button.click()

# 2. Locate the two inputs of the login form and type the credentials.
username_input = driver.find_element(By.XPATH, '//*[@id="__next"]/main/div[2]/div/div/form/div[1]/input')
password_input = driver.find_element(By.XPATH, '//*[@id="__next"]/main/div[2]/div/div/form/div[2]/input')
username_input.send_keys(login_username)
password_input.send_keys(login_password)
# Submit the form that contains the password field.
password_input.submit()
# NOTE(review): nothing waits for the login to finish before navigating away
# below; if the session is sometimes missing, add an explicit wait here.

# 3. Go back to the investor list. The URL is passed bare; the previous value
# was wrapped in '<' and '>', which is not a valid URL.
driver.get('https://www.innoforest.co.kr/dataroom/investor')

##### Module-level state #####
# Result containers. list_of_dict receives one dict per scraped portfolio row;
# the total_* names are not referenced in the visible part of this script.
list_of_dict = list()
total_data_list = list()
total_data = dict()
total_total_data = dict()
total_count = 0

# Cursors for the investor list page:
#   item_count   - 1-based index of the table row to open next (20 was used for testing)
#   page_count   - paging counter, compared against 8 by the crawl loop
#   page_count_2 - index of the pager <div> to click next
item_count, page_count, page_count_2 = 1, 1, 1
# Checked together with page_count_2 before the pager's div[6] arrow is clicked.
page_count_3 = 6

##### Visit every investor in the list, one table row at a time #####
# Flow per iteration:
#   1. wait for table row `item_count`; if it never appears the current list
#      page is exhausted, so advance the pager and start again at row 1
#   2. open the investor, read its name, open the full portfolio view
#   3. walk the portfolio pager, scraping each page with get_investor_portfolio()
#   4. go back to the list
# NOTE(review): the list page that is already displayed is scraped before the
# pager's div[1] is clicked; if div[1] is the first page number, that page is
# visited twice -- confirm against the live DOM.
while True:
    # Any lookup failure on the detail page skips this investor (see the
    # except clause at the bottom).
    try:

        # Try to pick the next row of the current list page.
        try:
            # Click each item (investor) in turn to open its detail view.
            print("******************** 전체의 "+str(item_count)+"번째 항목 *************************")
            next_item_xpath = f'//*[@id="__next"]/main/div[1]/div[2]/div/article/table/tbody/tr[{item_count}]/td[1]'
            next_item = WebDriverWait(driver, 3).until(EC.element_to_be_clickable((By.XPATH, next_item_xpath)))

        # No such row: the page has been fully read, so move to the next page.
        except TimeoutException:

            # First block of pages finished: click the "next" arrow at div[6].
            if page_count_2 == 6 and page_count_3 == 6:
                try:
                    print("not next_item 내부로 들어옴")
                    next_page_button_xpath = '//*[@id="__next"]/main/div/div[2]/div/article[2]/div[3]/div[6]/a'
                    next_page_button = WebDriverWait(driver, 3).until(EC.element_to_be_clickable((By.XPATH, next_page_button_xpath)))
                    next_page_button.click()
                    page_count = 3
                    page_count_2 = 3
                    # Was `page_count_6=8`, a typo that left page_count_3 at 6
                    # and made this branch fire again on the next block.
                    page_count_3 = 8
                except TimeoutException:
                    print("6 the end")
                    break

            # Later blocks of pages: the "next" arrow sits at div[8].
            elif page_count == 8:
                try:
                    # Fixed XPath: the original started with '///', which is
                    # not a valid location path.
                    next_page_xpath = '//*[@id="__next"]/main/div/div[2]/div/article[2]/div[3]/div[8]/a'
                    next_page = WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.XPATH, next_page_xpath)))
                    next_page.click()
                    # Restart the counters for the new block, mirroring what
                    # the portfolio pager below does after its div[8] arrow.
                    page_count = 3
                    page_count_2 = 3
                except TimeoutException:
                    print("8 the end")
                    break

            # Ordinary case: click the next page number inside the block.
            else:
                print("여기 들어옴")
                next_page_xpath = f'//*[@id="__next"]/main/div/div[2]/div/article[2]/div[3]/div[{page_count_2}]/span'
                try:
                    next_page = WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.XPATH, next_page_xpath)))
                except TimeoutException:
                    # No further page number exists: the crawl is complete.
                    print("no more list pages")
                    break
                next_page.click()
                page_count += 1
                page_count_2 += 1

            # New page: restart at the first row and fetch it again.
            item_count = 1  # 20 was used for testing
            next_item_xpath = f'//*[@id="__next"]/main/div[1]/div[2]/div/article/table/tbody/tr[{item_count}]/td[1]'
            try:
                next_item = WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.XPATH, next_item_xpath)))
            except TimeoutException:
                # The new page shows no rows; stop instead of crashing.
                print("first row of the new list page did not appear")
                break

        # Open the investor's detail view.
        next_item.click()
        print("next_item")
        item_count += 1

        ##### Read the investor name (get_investor_portfolio reads this global)
        name_element = driver.find_element(By.XPATH, '//*[@id="modal-wrapper"]/div/div[2]/div/div/div[3]/div/div[2]/div[2]/div[3]/a[1]/div/div[2]/div[1]/div[1]/span')
        investor_name = name_element.text

        ##### Click the "view full portfolio" button to reach the portfolio records
        portfolio_xpath = '//*[@id="investor3"]/div/div/div/div[2]/div[3]/div[2]/button/div/div/div[1]'
        portfolio = WebDriverWait(driver, 2).until(EC.element_to_be_clickable((By.XPATH, portfolio_xpath)))
        portfolio.click()
        time.sleep(3)
        print("portfolio")

        ##### Walk the portfolio pager, scraping each page #####
        item_count2 = 1
        page_count2 = 1

        while True:
            try:

                #### Click the round page buttons at the bottom ####
                print("******** "+str(item_count2)+"번째 항목 *************")
                next_item_xpath = f'//*[@id="investor3"]/div/div/div[2]/div/div[2]/div/div[2]/div/div[{item_count2}]/span'
                next_item = WebDriverWait(driver, 3).until(EC.element_to_be_clickable((By.XPATH, next_item_xpath)))
                next_item.click()
                time.sleep(1)
                item_count2 += 1
                page_count2 += 1
                print(page_count2)

                # Scrape the rows of the portfolio page that is now displayed.
                get_investor_portfolio()

            # The page button was not found: either an arrow must be clicked
            # to reach the next block (e.g. pages 6-10 after 1-5) or the
            # portfolio has no more pages.
            except TimeoutException:
                print("end starting at" + str(page_count2))

                if page_count2 == 6:
                    print("yess 6")

                    try:
                        next_page_button_xpath = '//*[@id="investor3"]/div/div/div[2]/div/div[2]/div/div[2]/div/div[6]/a'
                        next_page_button = WebDriverWait(driver, 3).until(EC.element_to_be_clickable((By.XPATH, next_page_button_xpath)))
                        next_page_button.click()
                        time.sleep(1)
                        item_count2 = 3
                        page_count2 = 3
                    except TimeoutException:
                        print("6 the end")
                        break

                elif page_count2 == 8:
                    print("yesss 8 ")

                    try:
                        next_page_button_xpath = '//*[@id="investor3"]/div/div/div[2]/div/div[2]/div/div[2]/div/div[8]/a'
                        next_page_button = WebDriverWait(driver, 3).until(EC.element_to_be_clickable((By.XPATH, next_page_button_xpath)))
                        next_page_button.click()
                        time.sleep(1)
                        item_count2 = 3
                        page_count2 = 3
                    except TimeoutException:
                        print("8 the end")
                        break

                else:

                    print("else end")
                    break

        # Portfolio finished: return to the investor list.
        driver.back()
        time.sleep(3)
        print("back")

    # An element on the detail page was not found, or the portfolio button
    # never became clickable: skip this investor instead of aborting the crawl.
    except (NoSuchElementException, TimeoutException):
        print("going back to driver")
        driver.back()