Finance Data & Analysis Tools Using Python
Last Updated: February 15, 2021 by Pepe Sandoval
If you find the information in this page useful and want to show your support, you can make a donation
Use PayPal
This will help me create more stuff and fix the existent content...
A Portfolio is a set of allocations in a variety of assets. In other words, it is a set of weighted assets, e.g. you have 0.1 of your money in BTC, 0.2 in AMZN stocks, 0.3 in AAPL stocks and 0.4 in an ETF
Key statistics in a portfolio
The Sharpe Ratio (SR) quantifies the relationship between the mean daily return and the std. of the daily return (volatility)
It is a measure of risk or more formally a measure of the risk-adjusted return
Formula $SR = \dfrac{R_p - R_f}{\sigma_p}$
Annualized Sharpe Ratio (ASR):
>=1
are considered good values, around >=2
is very good and >=3
is considered excellent. Randomly guessing and checking allocations based on the values of another statistic is in general known as a Monte Carlo Simulation
Minimization is just finding the value of the independent variable ($x$) that gives us the minimum value of the dependent variable ($y$)
Finding the optimal value for a metric can be done using an optimization algorithm, which is based on Minimization
We want to maximize Sharpe Ratio which means we can create an optimizer that attempts to minimize the negative Sharpe Ratio (inverse)
Border/Efficient Frontier indicates the highest return for a certain value of volatility
# Portfolio construction: load daily prices for four stocks, apply a fixed
# allocation of a $10,000 investment, then report daily-return statistics,
# cumulative return and the (annualized) Sharpe Ratio.
import math
import os

import matplotlib
matplotlib.use('agg')  # headless backend: figures are written to files only
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

base_path = "/home/ubuntu/"
YOUR_QUANDL_API_KEY_HERE = "QQQQQQQQQQQQQ"
start, end = pd.to_datetime("2012-01-01"), pd.to_datetime("2017-01-01")

# Get data. The commented quandl calls are the original remote source; the
# local CSVs are snapshots of the same "WIKI/<sym>.11" (Adj. Close) series.
aapl = pd.read_csv(os.path.join(base_path, "AAPL.csv"), sep=",", index_col='Date', parse_dates=True)
#aapl = quandl.get("WIKI/AAPL.11", start_date=start, end_date=end, api_key=YOUR_QUANDL_API_KEY_HERE)
aapl.name = "AAPL"
cisco = pd.read_csv(os.path.join(base_path, "CISCO.csv"), sep=",", index_col='Date', parse_dates=True)
#cisco = quandl.get("WIKI/CSCO.11", start_date=start, end_date=end, api_key=YOUR_QUANDL_API_KEY_HERE)
cisco.name = "CISCO"
amzn = pd.read_csv(os.path.join(base_path, "AMZN.csv"), sep=",", index_col='Date', parse_dates=True)
#amzn = quandl.get("WIKI/AMZN.11", start_date=start, end_date=end, api_key=YOUR_QUANDL_API_KEY_HERE)
amzn.name = "AMZN"
ibm = pd.read_csv(os.path.join(base_path, "IBM.csv"), sep=",", index_col='Date', parse_dates=True)
#ibm = quandl.get("WIKI/IBM.11", start_date=start, end_date=end, api_key=YOUR_QUANDL_API_KEY_HERE)
ibm.name = "IBM"

stock_dfs = (aapl, cisco, ibm, amzn)
stocks_allocations = (0.3, 0.2, 0.4, 0.1)
# Was: assert(sum(stocks_allocations) == 1.0) -- exact float equality is
# fragile and asserts vanish under `python -O`; validate explicitly instead.
if not math.isclose(sum(stocks_allocations), 1.0):
    raise ValueError("stock allocations must sum to 1.0")
investment_amount = 10000
all_position_values = []
col_names = []
risk_free_rate = 0.0  # simplification: treat the risk-free return as zero

for df, allo in zip(stock_dfs, stocks_allocations):
    # Normalize prices to the first day, then scale by this stock's share of
    # the total investment to get a daily position value in dollars.
    df["Normed Return"] = df["Adj. Close"] / df["Adj. Close"].iloc[0]
    df['Allocation'] = df["Normed Return"] * allo
    df["Position Values"] = df['Allocation'] * investment_amount
    all_position_values.append(df["Position Values"])
    col_names.append("{} Pos".format(str(df.name)))

portfolio = pd.concat(all_position_values, axis=1)
portfolio.columns = col_names
portfolio['Total Pos'] = portfolio.sum(axis=1)
portfolio['Daily Returns'] = portfolio['Total Pos'].pct_change(1)

average_daily_return = portfolio['Daily Returns'].mean()
std_daily_return = portfolio['Daily Returns'].std()
# Use .iloc for positional access: plain Series[-1]/[0] is label-based
# indexing and was removed for integer positions in pandas 2.x.
cumulative_return_percent = 100 * (portfolio['Total Pos'].iloc[-1] / portfolio['Total Pos'].iloc[0] - 1)
total_value = portfolio['Total Pos'].iloc[-1]
# Daily Sharpe Ratio and its annualized version (252 trading days per year).
SR = (average_daily_return - risk_free_rate) / std_daily_return
ASR = (252**0.5) * SR

# Print portfolio data
#print(aapl.head()) ; print(cisco.head()) ; print(amzn.head()) ; print(ibm.head())
#print(aapl.tail()) ; print(cisco.tail()) ; print(amzn.tail()) ; print(ibm.tail())
#print(portfolio.head())
print("average_daily_return=", average_daily_return)
print("std_daily_return=", std_daily_return)
print("cumulative_return_percent=", cumulative_return_percent, "total_value=", total_value)
print("SR=", SR, "ASR=", ASR)

# Plot portfolio stuff
#portfolio["Total Pos"].plot(figsize=(10,8)).get_figure().savefig(os.path.join(base_path, "total.png"))
#portfolio.drop("Total Pos", axis=1).plot(figsize=(10,8)).get_figure().savefig(os.path.join(base_path, "positions.png"))
#portfolio["Daily Returns"].plot(kind='hist', bins=100, figsize=(4,5))
#portfolio["Daily Returns"].plot(kind='kde', figsize=(4,5)).get_figure().savefig(os.path.join(base_path, "hist_and_kde.png"))
# Portfolio allocation via Monte Carlo simulation and scipy optimization:
# 1. sample random weight vectors and record return/volatility/Sharpe,
# 2. find the max-Sharpe weights with SLSQP (minimize the negative Sharpe),
# 3. trace the efficient frontier (min volatility for each target return).
import os

import matplotlib
matplotlib.use('agg')  # headless backend: figures are written to files only
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.optimize import minimize

np.random.seed(101)  # reproducible random portfolios
risk_free_rate = 0.0
base_path = "/home/ubuntu/"
YOUR_QUANDL_API_KEY_HERE = "QQQQQQQQQQQQQ"
start, end = pd.to_datetime("2012-01-01"), pd.to_datetime("2017-01-01")

# Get data (commented quandl calls are the original remote source)
aapl = pd.read_csv(os.path.join(base_path, "AAPL.csv"), sep=",", index_col='Date', parse_dates=True)
#aapl = quandl.get("WIKI/AAPL.11", start_date=start, end_date=end, api_key=YOUR_QUANDL_API_KEY_HERE)
aapl.name = "AAPL"
cisco = pd.read_csv(os.path.join(base_path, "CISCO.csv"), sep=",", index_col='Date', parse_dates=True)
#cisco = quandl.get("WIKI/CSCO.11", start_date=start, end_date=end, api_key=YOUR_QUANDL_API_KEY_HERE)
cisco.name = "CISCO"
amzn = pd.read_csv(os.path.join(base_path, "AMZN.csv"), sep=",", index_col='Date', parse_dates=True)
#amzn = quandl.get("WIKI/AMZN.11", start_date=start, end_date=end, api_key=YOUR_QUANDL_API_KEY_HERE)
amzn.name = "AMZN"
ibm = pd.read_csv(os.path.join(base_path, "IBM.csv"), sep=",", index_col='Date', parse_dates=True)
#ibm = quandl.get("WIKI/IBM.11", start_date=start, end_date=end, api_key=YOUR_QUANDL_API_KEY_HERE)
ibm.name = "IBM"

stock_dfs = [aapl, cisco, ibm, amzn]
stocks = pd.concat(stock_dfs, axis=1)
stocks.columns = ["{}".format(str(df.name)) for df in stock_dfs]
#print(stocks.head())
#print(stocks.pct_change(1).mean())
#print(stocks.pct_change(1).corr())
#print(stocks.pct_change(1).head())  # daily returns in arithmetic

log_ret = np.log(stocks / stocks.shift(1))  # daily returns in log normalization
#print(log_ret.head())

number_of_portfolios = 500
# One row of weights per simulated portfolio, len(stocks.columns) weights each.
all_weights = np.zeros((number_of_portfolios, len(stocks.columns)))
expected_returns_array = np.zeros(number_of_portfolios)
expected_volatility_array = np.zeros(number_of_portfolios)
sharpe_ratio_arrays = np.zeros(number_of_portfolios)


def get_returns_volatility_sharpe_ratio(weights):
    """Return [annualized return, annualized volatility, Sharpe ratio].

    Annualization assumes 252 trading days; inputs are the global daily
    log returns (`log_ret`) weighted by `weights`.
    """
    expected_returns = np.sum(log_ret.mean() * weights * 252)
    expected_volatility = np.sqrt(np.dot(weights.T, np.dot(log_ret.cov() * 252, weights)))
    sharpe_ratio = (expected_returns - risk_free_rate) / expected_volatility
    return np.array([expected_returns, expected_volatility, sharpe_ratio])


def negative_sharpe(weights):
    """Negated Sharpe ratio: minimizing this maximizes the Sharpe ratio."""
    return get_returns_volatility_sharpe_ratio(weights)[2] * -1


def check_sum(weights):
    """Equality-constraint helper: zero exactly when the weights sum to 1."""
    return (np.sum(weights) - 1)


# 1. Random allocation (Monte Carlo)
for portfolio_index in range(number_of_portfolios):
    weights = np.array(np.random.random(len(stocks.columns)))
    weights = weights / np.sum(weights)  # normalize so they all sum to 1
    all_weights[portfolio_index, :] = weights
    ret = get_returns_volatility_sharpe_ratio(weights)
    expected_returns_array[portfolio_index] = ret[0]
    expected_volatility_array[portfolio_index] = ret[1]
    sharpe_ratio_arrays[portfolio_index] = ret[2]
    #print("index", portfolio_index, "weights=", weights) ; print("expected_return=",expected_returns_array[portfolio_index]) ; print("expected_volatility=",expected_volatility_array[portfolio_index]) ; print("sharpe_ratio=",sharpe_ratio_arrays[portfolio_index]) ; print("-"*10 +"\n")

max_sr, max_sr_index = sharpe_ratio_arrays.max(), sharpe_ratio_arrays.argmax()
max_sr_returns = expected_returns_array[max_sr_index]
max_sr_volatility = expected_volatility_array[max_sr_index]
#print("max SR", max_sr, "in index", max_sr_index, "optimal weights=", all_weights[max_sr_index], "max_sr_returns=", max_sr_returns, "max_sr_volatility", max_sr_volatility)

# 2. Optimization with scipy: maximize Sharpe by minimizing its negative.
constraints = ({"type": "eq", "fun": check_sum})
bounds = ((0, 1), (0, 1), (0, 1), (0, 1))  # long-only: each weight in [0, 1]
initial_guess = [0.25, 0.25, 0.25, 0.25]
opt_results = minimize(fun=negative_sharpe, x0=initial_guess, method="SLSQP", bounds=bounds, constraints=constraints)
print(opt_results)
optimal_weights = opt_results.x  # fixed misspelling: was `results_weighst`
ret = get_returns_volatility_sharpe_ratio(optimal_weights)
max_sr_returns, max_sr_volatility, max_sr = ret[0], ret[1], ret[2]
#print("max SR", max_sr, "weights", optimal_weights, "max_sr_returns=", max_sr_returns, "max_sr_volatility", max_sr_volatility)

# 3. Get optimal returns portfolios for levels of volatility (the frontier).
frontier_y = np.linspace(0, 0.3, 25)


def minimize_volatility(weights):
    """Objective: annualized portfolio volatility for the given weights."""
    return get_returns_volatility_sharpe_ratio(weights)[1]


frontier_volatility = []
for possible_return in frontier_y:
    # Two equality constraints: weights sum to 1 AND the portfolio return
    # hits the current target. (The original chained a misspelled
    # `constrainsts =` onto this assignment; the extra name was unused.)
    # The target is bound as a lambda default to avoid late-binding surprises.
    frontier_constraints = (
        {"type": "eq", "fun": check_sum},
        {"type": "eq", "fun": lambda w, target=possible_return: get_returns_volatility_sharpe_ratio(w)[0] - target},
    )
    result = minimize(fun=minimize_volatility, x0=initial_guess, method="SLSQP", bounds=bounds, constraints=frontier_constraints)
    frontier_volatility.append(result['fun'])

# Do all plotting
plt.figure(figsize=(12, 8))
plt.scatter(expected_volatility_array, expected_returns_array, c=sharpe_ratio_arrays, cmap='plasma')
plt.colorbar(label="Sharpe Ratio")
plt.xlabel("Volatility")
plt.ylabel("Return")
plt.scatter(max_sr_volatility, max_sr_returns, c="red", s=50, edgecolors="black")  # max-Sharpe portfolio
plt.plot(frontier_volatility, frontier_y, "b--", linewidth=3)  # efficient frontier
plt.savefig(os.path.join(base_path, "sr_optimal.png"))
Liquidity refers to how easy can you get your money in and out of an investment, high liquidity means it is very easy, low liquidity means it is hard or you need to wait for a certain period of time until a contract date is reached for example
Some stocks pay Dividends, so for each stock a shareholder has, he receives some payout at certain defined date, this usually cause price to jump before dividend announcement and then drop after dividend pay out
Stock Splits usually occur if the price of an individual stock become really high, so a company creates a ratio split (e.g. 2:1, 3:1, 4:1, 5:1) this is the reason the Adj. Close price exists, which adjusts the historical prices to match up and take into account the stock splits, also takes into account dividends. This is the reason it is important to use adjusted close/open prices for historical analysis
Survivorship Bias: if you are using the S&P500 as an indicator, the time period you pick can matter since the list of companies has changed through the years
EMH (Efficient Market Hypothesis) is an investment theory that states it is impossible to "beat the market", so it is not possible for traders to purchase undervalued stocks or sell stocks at inflated prices
When you want to buy or sell an asset, what is happening is that you are creating an Order with a broker; the order then goes to an exchange (or multiple exchanges), and once the exchange receives the order it goes into an Order Book, which is just a list of buy and sell orders
Order Information:
Orders Scenarios:
The Capital Assets Pricing Model (CAPM) is a model that help describe risk and also helps to separate market return vs your portfolio return
We can define the return of a portfolio at some time $t$ as the following formula for $r_p(t)$ where:
$$r_p(t) = \sum_{i=1}^n w_i r_i(t)$$
$$w_i = \dfrac{\text{MarketCap}_i}{\sum_{j=1}^n \text{MarketCap}_j}$$
$$r_i(t) = \beta_i r_m(t) + \alpha_i(t)$$
$$r_p(t) = \beta_p r_m(t) + \alpha_p(t)$$
import os import datetime from scipy import stats import pandas as pd import pandas_datareader as web import matplotlib.pyplot as plt import numpy as np base_path = "." SYM = "AMZN" stock = pd.read_csv(os.path.join(base_path, "{}.csv".format(SYM)), index_col='Date', parse_dates=True) spy_etf = df = pd.read_csv(os.path.join(base_path, "SPY.csv"), sep=",", index_col='Date', parse_dates=True) #stock["close"].plot(label=SYM) #spy_etf["Close"].plot(label="SPY ETF") stock["cumulative"] = stock["close"]/stock["close"].iloc[0] stock["daily return"] = stock["close"].pct_change(1) spy_etf["Cumulative"] = spy_etf["Close"]/spy_etf["Close"].iloc[0] spy_etf["Daily Return"] = spy_etf["Close"].pct_change(1) # Compare cumulative returns # stock["cumulative"].plot(label=SYM) # spy_etf["Cumulative"].plot(label="SPY ETF") # Check correlation, if there is high correlation we expect to see a line # plt.scatter(stock["daily return"], spy_etf["Daily Return"], alpha=0.25) # High beta means asset behaves pretty much like the market beta, alpha, r_value, p_value, std_err = stats.linregress(stock["daily return"].iloc[1:], spy_etf["Daily Return"].iloc[1:]) print("CAPM for {}\nbeta".format(SYM), beta, "\nalpha", alpha, "\nr_value", r_value, "\np_value", p_value, "\nstd_err", std_err) # Simulate a stock that behaves a lot like the market noise = np.random.normal(0, 0.001, len(spy_etf["Daily Return"].iloc[1:])) fake_stock = spy_etf["Daily Return"].iloc[1:] + noise beta, alpha, r_value, p_value, std_err = stats.linregress(fake_stock, spy_etf["Daily Return"].iloc[1:]) print("CAPM for fake stock\nbeta", beta, "\nalpha", alpha, "\nr_value", r_value, "\np_value", p_value, "\nstd_err", std_err) # plt.scatter(fake_stock, spy_etf["Daily Return"].iloc[1:], alpha=0.25) # print(stock.head()) # print(spy_etf.head()) # plt.legend() # plt.show()
Blueshift® is a platform that allows coders to write, test and backtest investment algorithms
initialize
:
context
as input; context
is an object which is basically an extended dictionary used to maintain the state of your algorithm during the backtest or live trading; handle_data
:
context
and data
as inputs; data
object with methods to adjust portfolio and check historical data
from datetime import datetime AAPL = 0 CSCO = 1 AMZN = 2 IBM = 3 # Zipline from zipline.api import( symbol, order_target_percent, schedule_function, date_rules, time_rules, record ) def initialize(context): """ A function to define things to do at the start of the strategy """ context.tech_stocks = [symbol("AAPL") , symbol("CSCO"), symbol("AMZN"), symbol("IBM")] ## Uncomment one of the following sets # 1 Record schedule_function(rebalance, date_rules.every_day(), time_rules.market_open()) schedule_function(record_vars, date_rules.every_day(), time_rules.market_close()) # 2 schedule to do something witha certain frequency, like adjust portfolio #schedule_function(open_positions, date_rules.week_start(), time_rules.market_open()) #schedule_function(close_positions, date_rules.week_end(), time_rules.market_close()) def rebalance(context, data): order_target_percent(context.tech_stocks[AMZN], 0.5) order_target_percent(context.tech_stocks[IBM], -0.5) #Short sell ibm def record_vars(context, data): record(amzn_close=data.current(context.tech_stocks[AMZN], 'close')) record(ibm_close=data.current(context.tech_stocks[IBM], 'close')) def open_positions(context, data): order_target_percent(context.tech_stocks[AAPL], 0.1) def close_positions(context, data): order_target_percent(context.tech_stocks[AAPL], 0) def handle_data(context, data): print(datetime.now().strftime("%d/%m/%Y %H:%M:%S")) print(data.is_stale(symbol("AAPL")), data.can_trade(symbol("AAPL"))) price_history = data.history(context.tech_stocks, fields="price", bar_count=5, frequency='1d') print(price_history) tech_close = data.current(context.tech_stocks, 'close') print(type(tech_close)) print(tech_close) print("-"*15 + "\n")
np.corrcoef(series1, series2)
and see if their correlation coefficient is close to 1## Research script pure python import os import seaborn as sns import numpy as np import pandas as pd import matplotlib import datetime import pandas_datareader.data as web matplotlib.use('agg') import matplotlib.pyplot as plt # takes a time series and normalizes it def zscore(stock): return (stock-stock.mean())/np.std(stock) base_path = "/home/ubuntu/" start, end = datetime.datetime(2015, 1, 1), datetime.datetime(2017, 1, 1) ual = pd.read_csv(os.path.join(base_path, "UAL.csv"), index_col='Date', parse_dates=True) american = pd.read_csv(os.path.join(base_path, "AMER.csv"), index_col='Date', parse_dates=True) corr_matrix = np.corrcoef(american['close'], ual['close']) #print(ual.head()) ; print(american.head()) print(corr_matrix) dif = american['close'] - ual['close'] spread = zscore(dif) spread_mavg1 = dif.rolling(1).mean() spread_mavg30 = dif.rolling(30).mean() std_30 = dif.rolling(30).std() zscore_30_1 = (spread_mavg1-spread_mavg30)/std_30 fig = plt.figure() ; american['close'].plot(label='AAL') ; ual['close'].plot(label='UAL'); plt.legend() ; fig.savefig(os.path.join(base_path, "airlines.png")) fig = plt.figure() ;spread.plot(label='Spread') ; plt.axhline(spread.mean(), c='b') ; plt.axhline(1.0, c='g', ls='--') ; plt.axhline(-1.0, c='r', ls='--') ; plt.legend() ; fig.savefig(os.path.join(base_path, "spread.png")) fig = plt.figure() ; zscore_30_1.plot(label='Rolling 30 day z score') ; plt.axhline(0, c='b', ls='--') ; plt.axhline(1.0, c='r', ls='--') ; fig.savefig(os.path.join(base_path, "zscore_30_1.png")) ## Blueshift Implementation import numpy as np # Zipline from zipline.api import( symbol, order_target_percent, schedule_function, date_rules, time_rules, record, ) def initialize(context): context.aa = symbol("AAL") context.ual = symbol("UAL") context.long_on_spread = False context.short_spread = False schedule_function(check_pairs, date_rules.every_day(), time_rules.market_close(minutes=60)) def 
check_pairs(context, data): aa = context.aa ual = context.ual prices = data.history([aa, ual], 'price', 30, '1d') short_prices = prices.iloc[-1:] mavg_30 = np.mean(prices[aa] - prices[ual]) std_30 = np.std(prices[aa] - prices[ual]) if std_30 > 0: mavg_1 = np.mean(short_prices[aa] - short_prices[ual]) zscore = (mavg_1-mavg_30)/std_30 if zscore > 1.0 and not context.short_spread: order_target_percent(aa, -0.5) order_target_percent(ual, 0.5) context.short_spread = True context.long_on_spread = False elif zscore < 1.0 and not context.long_on_spread: order_target_percent(aa, 0.5) order_target_percent(ual, -0.5) context.short_spread = False context.long_on_spread = True elif abs(zscore) < 0.1: order_target_percent(aa, 0) order_target_percent(ual, 0) context.long_on_spread = False context.short_spread = False record(zscore=zscore)
import numpy as np # Zipline from zipline.api import( symbol, order_target_percent, schedule_function, date_rules, time_rules, record, ) def initialize(context): context.stock = symbol("JNJ") schedule_function(check_stock_bands, date_rules.every_day()) def check_stock_bands(context, data): stock = context.stock current_price = data.current(stock, 'price') historical_prices = data.history(stock, 'price', 20, '1d') avg = historical_prices.mean() std = historical_prices.std() lower_band = avg - 2*std upper_band = avg + 2*std date = historical_prices.index[-1].strftime('%Y-%m-%d') if current_price <= lower_band: print(date, '- BUYING', stock, 'UPPER', upper_band, 'LOWER', lower_band, 'CURR', current_price, 'MAVG20', avg) order_target_percent(stock, 1.0) elif current_price >= upper_band: print(date, '- SELLING', stock, 'UPPER', upper_band, 'LOWER', lower_band, 'CURR', current_price, 'MAVG20', avg) order_target_percent(stock, 0) else: pass record(upper=upper_band, lower=lower_band, curr=current_price, mavg_20=avg)
Trading platform pipelines are used for algorithms that follow a structure (a set of steps you need to perform in a certain order). Pipelines provide an API to implement the general structure of the algorithms, which is:
Classifier is a function that transforms the input of an asset and a timestamp to a categorical output. e.x. taking AAPL and 2017 as inputs and returns it belongs to tech sector
A Factor is a function that takes in an asset and a timestamp and returns a numerical value such as the 20 day Moving Average
Filters take in an asset and timestamp and return a Boolean while Screens allows you to actually select rows where that filter is true and Masking allows you to ignore assets, the main differences with screens is that masks take place at the beginning and screens & filters take place at the end
Leverage is the ability to borrow money for use in investing, in trading it can be seen as reinvesting bet to gain a greater return on our investment
$$\text{Leverage Ratio} = \dfrac{\text{Debt} + \text{Base Capital}}{\text{Base Capital}}$$
# Hedging demo: estimate alpha/beta of a stock against SPY with OLS, then
# build a beta-hedged return series and verify its beta collapses toward 0.
import os
import datetime

from scipy import stats
import pandas as pd
import numpy as np
import pandas_datareader as web
import matplotlib.pyplot as plt
from statsmodels import regression
import statsmodels.api as sm


def alpha_beta(benchmark_daily_returns, stock_daily_returns):
    """OLS fit of stock daily returns on benchmark daily returns.

    Returns (alpha, beta): the regression intercept and slope.
    """
    stock_values = stock_daily_returns.values
    benchmark_values = benchmark_daily_returns.values
    # Prepend a constant column so the model fits an intercept (alpha).
    benchmark_const = sm.add_constant(benchmark_values)
    fitted = regression.linear_model.OLS(stock_values, benchmark_const).fit()
    alpha, beta = fitted.params
    return alpha, beta


base_path = "."
SYM = "AMZN"
stock = pd.read_csv(os.path.join(base_path, "{}_2016.csv".format(SYM)), index_col='Date', parse_dates=True)
spy_etf = df = pd.read_csv(os.path.join(base_path, "SPY_2016.csv"), sep=",", index_col='Date', parse_dates=True)

# Cumulative and daily returns for both series (stock CSV uses lowercase 'close').
stock["cumulative"] = stock["close"] / stock["close"].iloc[0]
stock["daily return"] = stock["close"].pct_change(1)
spy_etf["Cumulative"] = spy_etf["Close"] / spy_etf["Close"].iloc[0]
spy_etf["Daily Return"] = spy_etf["Close"].pct_change(1)

# Drop the NaN each pct_change leaves on the first row before regressing.
stock_daily_return = stock["daily return"].dropna()
spy_etf_daily_return = spy_etf["Daily Return"].dropna()

alpha, beta = alpha_beta(benchmark_daily_returns=spy_etf_daily_return, stock_daily_returns=stock_daily_return)
print(alpha, beta)

# Regression line drawn over the benchmark's observed return range.
min_spy = spy_etf_daily_return.min()
max_spy = spy_etf_daily_return.max()
spy_line = np.linspace(min_spy, max_spy, 100)
y = spy_line * beta + alpha
plt.figure(figsize=(12, 6))
plt.plot(spy_line, y, 'r')
plt.scatter(spy_etf["Daily Return"], stock["daily return"], alpha=0.6, s=50)
plt.xlabel("SPY Ret")
plt.ylabel("{} Ret".format(SYM))

# Hedged series: subtract the market component (beta * benchmark); the
# residual should regress to beta near zero against the benchmark.
hedged = -1 * (beta * spy_etf_daily_return) + stock_daily_return
alpha, beta = alpha_beta(spy_etf_daily_return, hedged)
print("HEDGED", alpha, beta)
print(hedged.mean(), hedged.std())
print(stock_daily_return.mean(), stock_daily_return.std())

plt.figure(figsize=(12, 6))
hedged.plot(label="{} with Hedge".format(SYM), alpha=0.9)
stock_daily_return.plot(label=SYM, alpha=0.5)
spy_etf_daily_return.plot(label="SPY ETF", alpha=0.5)
plt.xlim(['2016-06-01', '2016-08-01'])
plt.legend()
plt.show()
Sentiment Analysis uses NLP (Natural Language Processing) to attempt to detect sentiment in some text
Forward Contracts are an agreement between two parties to pay a delivery price $K$ for some asset at some future time while the actual market price at time of that asset is $S_T$
derivatives are contracts between two or more entities, the value of the contract is based on an agreed-upon underlying financial asset (like a stock, market index, currency, etc)
Futures
If you find the information in this page useful and want to show your support, you can make a donation
Use PayPal
This will help me create more stuff and fix the existent content...