Extracting Canadian Stock Information from Yahoo Finance

Importing packages

In [1]:
# importing packages
import datetime
import df2gspread as d2g
import glob
import gspread
import gspread_dataframe as gd
import itertools
import pandas as pd
import numpy as np
import pickle
import pymysql
import re
import requests
import os
import sys

from bs4 import BeautifulSoup
from datetime import date, timedelta, datetime
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow,Flow
from google.auth.transport.requests import Request
from google.oauth2.service_account import Credentials
from IPython.core.display import clear_output
from oauth2client.service_account import ServiceAccountCredentials
from pandas import json_normalize
from pprint import pprint as pp
from requests import get
from requests.packages import urllib3
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.color import Color
from selenium.common.exceptions import NoSuchElementException
from time import sleep
from tqdm import tqdm
from warnings import warn

Activating Selenium to collect stock codes

The code below opens a new Yahoo screener and filters for Canadian stocks with a market cap greater than $75M; you can change the country and the market-cap threshold in the code. Download the chromedriver from this link and make sure the Chrome and driver versions match (or fetch a matching driver automatically, as sketched below).
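
If you'd rather not manage the driver binary by hand, the third-party webdriver-manager package (not used in this notebook; pip install webdriver-manager) can fetch a ChromeDriver matching your installed Chrome. A minimal sketch:

#optional: fetch a matching ChromeDriver automatically with webdriver-manager
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))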

In [2]:
# opens a browser window with the link below
driver = webdriver.Chrome()
driver.get('https://ca.finance.yahoo.com/screener/new')
sleep(10)

# remove the default United States selection
us_selection_xpath = '//*[@id="screener-criteria"]/div[2]/div[1]/div[1]/div[1]/div/div[2]/ul/li[1]'
driver.find_element(By.XPATH, us_selection_xpath).click()
sleep(5)

#click on the Add region button
add_region_xpath = '//*[@id="screener-criteria"]/div[2]/div[1]/div[1]/div[1]/div/div[2]/ul/li'
driver.find_element(By.XPATH, add_region_xpath).click()
sleep(5)

#select the Canada checkbox
canada_box_xpath = '//*[@id="dropdown-menu"]/div/div[2]/ul/li[7]/label/span'
driver.find_element(By.XPATH, canada_box_xpath).click()
sleep(5)

#click on the add filter button
add_filter_xpath = '//*[@id="screener-criteria"]/div[2]/div[1]/div[1]/div[6]/button/span'
driver.find_element(By.XPATH, add_filter_xpath).click()
sleep(5)

#check the Market Cap filter
market_cap_xpath = '//*[@id="screener-criteria"]/div[2]/div[1]/div[1]/div[6]/div/div/div[2]/div[2]/div/ul/li[18]/label/span'
driver.find_element(By.XPATH, market_cap_xpath).click()
sleep(5)

#close the filter tab
close_button_xpath = '//*[@id="screener-criteria"]/div[2]/div[1]/div[1]/div[7]/div/div/div[3]/button'
driver.find_element(By.XPATH, close_button_xpath).click()
sleep(5)

#input the 75M market cap threshold
market_cap_box_xpath = '//*[@id="screener-criteria"]/div[2]/div[1]/div[1]/div[6]/div/div[2]/input'
driver.find_element(By.XPATH, market_cap_box_xpath).send_keys('75000000')
sleep(5)

#click on search
find_button_xpath = '//*[@id="screener-criteria"]/div[2]/div[1]/div[3]/button[1]'
driver.find_element(By.XPATH, find_button_xpath).click()
sleep(5)
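
As an aside, the fixed sleep() calls above are fragile on slow connections. A sketch using Selenium's explicit waits (WebDriverWait plus expected_conditions, both shipped with Selenium) blocks only until the element is actually ready, shown here for the search button:

#optional: wait for an element to become clickable instead of sleeping a fixed time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

wait = WebDriverWait(driver, 15)  #poll for up to 15 seconds
wait.until(EC.element_to_be_clickable((By.XPATH, find_button_xpath))).click()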
In [3]:
#getting current url
url = driver.current_url

#creating an empty list for code
Company_Code = []

#loop through the screener result pages using bs4 (adjust the 1300 upper bound to your screener's total matches)
for i in tqdm(range(0, 1300, 100), position = 0):
    
    new_url = str(url) + '?count=100&offset=' + str(i)
    response = get(new_url)
    html_soup = BeautifulSoup(response.text, 'html.parser')
    
    for companies in html_soup.find_all('a', class_='Fw(600)'):
        Company_Code.append(companies.text)
100%|██████████████████████████████████████████████████████████████████████████████████| 13/13 [00:21<00:00,  1.64s/it]

Getting all data for the codes scraped above

The code below scrapes, for every stock code collected above, all the information available in the modules named in the request. All the available modules are listed below, and a quick single-ticker preview is sketched after the list:

  • defaultKeyStatistics
  • financialData
  • calendarEvents
  • assetProfile
  • summaryDetail
  • upgradeDowngradeHistory
  • recommendationTrend
  • earnings
  • price
  • cashflowStatementHistory
  • indexTrend
  • industryTrend
  • incomeStatementHistory
  • fundOwnership
  • insiderHolders
  • balanceSheetHistory
  • earningsTrend
  • secFilings
  • institutionOwnership
  • majorHoldersBreakdown
  • balanceSheetHistoryQuarterly
  • earningsHistory
  • majorDirectHolders
  • netSharePurchaseActivity
  • insiderTransactions
  • sectorTrend
  • incomeStatementHistoryQuarterly
  • cashflowStatementHistoryQuarterly
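
As a sanity check before looping over every code, you can request a single module for one ticker and inspect the raw JSON. A minimal sketch against the same unofficial quoteSummary endpoint used below (SHOP.TO is just an illustrative symbol; the endpoint is undocumented and may require extra headers over time):

#preview the raw JSON for one module and one illustrative ticker
preview_url = 'https://query1.finance.yahoo.com/v10/finance/quoteSummary/SHOP.TO'
preview = requests.get(preview_url, params={'modules': 'price'})
pp(preview.json()['quoteSummary']['result'][0]['price'])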
In [4]:
#creating list of all the raw data for all the codes
raw_data = []

#loop to collect data
for stock_code in tqdm(Company_Code, position = 0):
    params = {"formatted": "true",
                "crumb": "AKV/cl0TOgz",
                "lang": "en-US",
                "region": "US",
                "modules": "defaultKeyStatistics,financialData,price,summaryDetail,calendarEvents,assetProfile", #enter your modules here
                "corsDomain": "ca.finance.yahoo.com"}
    
    url = "https://query1.finance.yahoo.com/v10/finance/quoteSummary/{}".format(stock_code)
    r = requests.get(url, params=params)
    
    try:
        data = r.json()['quoteSummary']['result'][0]
    except (TypeError, ValueError, KeyError, IndexError):
        continue #skip codes with no usable result instead of re-using stale data
    
    df = json_normalize(data)
    raw_data.append(df)

#creating a dataframe of all the data scraped
all_raw_data_df = pd.concat(raw_data)
100%|████████████████████████████████████████████████████████████████████████████████| 927/927 [07:54<00:00,  1.95it/s]
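
Since the full pull takes several minutes, it's worth caching the raw dataframe before cleaning it, so a bad cleaning step doesn't force a re-scrape. A minimal sketch using pandas' built-in pickle support (the file name is arbitrary):

#cache the raw pull to disk
all_raw_data_df.to_pickle('all_raw_data.pkl')

#reload later with: all_raw_data_df = pd.read_pickle('all_raw_data.pkl')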

Cleaning Data

In [5]:
#list of all the columns needed from the raw data
stock_column_dict = {
                    'price': ['regularMarketChangePercent.raw', 'regularMarketChange.raw', 'regularMarketPrice.raw', 'exchange', 
                              'exchangeName', 'quoteType', 'symbol', 'longName'],
                    'financialData': ['targetHighPrice.raw', 'targetLowPrice.raw', 'targetMeanPrice.raw', 'targetMedianPrice.raw', 
                                      'totalCash.raw','totalDebt.raw', 'recommendationKey'],
                    'summaryDetail': ['previousClose.raw', 'open.raw', 'dayLow.raw', 'dayHigh.raw', 'dividendRate.raw', 'dividendYield.raw',
                                       'payoutRatio.raw', 'fiveYearAvgDividendYield.raw', 'beta.raw', 'trailingPE.raw', 'forwardPE.raw',
                                       'volume.raw', 'averageVolume.raw', 'averageVolume10days.raw', 'marketCap.raw', 
                                       'fiftyTwoWeekLow.raw', 'fiftyTwoWeekHigh.raw', 'fiftyDayAverage.raw', 'twoHundredDayAverage.raw'],
                    'defaultKeyStatistics': ['enterpriseValue.raw', 'sharesOutstanding.raw', 'profitMargins.raw', 'pegRatio.raw', 
                                             '52WeekChange.raw', 'trailingEps.raw', 'forwardEps.raw', 'enterpriseToRevenue.raw', 
                                             'enterpriseToEbitda.raw', 'bookValue.raw','priceToBook.raw'],
                    'calendarEvents': ['earnings.earningsDate'],
                    'assetProfile': ['website', 'industry', 'sector', 'longBusinessSummary', 'city', 'state']
                    }

#creating a list of columns needed
columns_to_look = []
for key, value in (itertools.chain.from_iterable([itertools.product((k, ), v) for k, v in stock_column_dict.items()])): 
    columns_to_look.append(key + '.' + value)
    
#filter for columns in the list from raw data (.copy() avoids SettingWithCopyWarning downstream)
selected_column_df = all_raw_data_df[all_raw_data_df.columns[all_raw_data_df.columns.isin(columns_to_look)]].copy()

#renaming columns
selected_column_df.columns = selected_column_df.columns.str.split('.').str[1]

#renaming the flattened calendarEvents earnings column
selected_column_df.rename(columns={'earnings':'earningsDate'}, inplace=True)

#moving date to first column
selected_column_df['extractDate'] = datetime.now().strftime('%m/%d/%Y')
column_name = 'extractDate'
first_column = selected_column_df.pop(column_name)
selected_column_df.insert(0, column_name, first_column)

#creating an independent copy for the final version
final_df = selected_column_df.copy()

#convert object columns to numeric
columns_to_convert = ['previousClose', 'open', 'dayLow', 'dayHigh', 'beta', 'trailingPE', 'volume', 
                      'averageVolume', 'averageVolume10days', 'marketCap', 'fiftyTwoWeekLow', 'fiftyTwoWeekHigh', 
                      'fiftyDayAverage', 'twoHundredDayAverage', 'regularMarketChangePercent', 
                      'regularMarketChange', 'regularMarketPrice', 'enterpriseValue', 'profitMargins', 
                      'sharesOutstanding', 'bookValue', 'priceToBook', 'trailingEps', 'enterpriseToRevenue', '52WeekChange', 
                      'totalCash', 'totalDebt', 'payoutRatio', 'forwardPE', 'forwardEps', 'enterpriseToEbitda', 
                      'targetHighPrice', 'targetLowPrice', 'targetMeanPrice', 'targetMedianPrice', 'dividendRate', 
                      'dividendYield', 'fiveYearAvgDividendYield', 'pegRatio']

final_df[columns_to_convert] = final_df[columns_to_convert].apply(pd.to_numeric, errors='coerce')
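
As a quick spot-check before uploading, preview a few key columns (all created by the steps above):

#spot-check the cleaned dataframe
final_df[['extractDate', 'symbol', 'longName', 'marketCap', 'regularMarketPrice']].head()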

Uploading data to Google Sheets

Before you proceed, set up authentication for your Google account by watching this video.

In [6]:
scope = ['https://spreadsheets.google.com/feeds', 
         'https://www.googleapis.com/auth/drive']

#enter your path to json credentials below 
path_to_json_credentials = r'' #enter json credential path here

credentials = ServiceAccountCredentials.from_json_keyfile_name(path_to_json_credentials, scope)
gc = gspread.authorize(credentials)
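
Remember that the service account can only open workbooks that have been shared with it. If you're unsure of its address, it's available on the credentials object, so you can print it and share the sheet with that email:

#the workbook opened below must be shared with this service account email
print(credentials.service_account_email)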
In [9]:
#enter your google workbook and worksheet name below 
workbook_name = "Canada Stocks"
worksheet_name = "Stocks EOD Data"

workbook = gc.open(workbook_name)
sheet = workbook.worksheet(worksheet_name)

Run the cell below only once, when adding stocks for the first time

In [10]:
gd.set_with_dataframe(sheet, final_df)

Run the cell below on every subsequent run, once the first-time upload above has been done

In [ ]:
#importing the current df on google sheets
existing_google_df = gd.get_as_dataframe(sheet)

#appending the final df
updated_df = pd.concat([existing_google_df, final_df], ignore_index=True)

#uploading the df to google sheet
gd.set_with_dataframe(sheet, updated_df)
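
If the script runs more than once on the same day, the append above will stack duplicate rows. A one-liner worth applying to updated_df before the final set_with_dataframe call (extractDate and symbol are the columns created earlier):

#drop duplicate date/symbol pairs, keeping the latest append
updated_df = updated_df.drop_duplicates(subset=['extractDate', 'symbol'], keep='last')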