Compare commits

...

65 Commits

Author SHA1 Message Date
ValueRaider
fb5c67b3bd Bump version to 0.2.0rc1 - big update 2022-10-26 22:39:41 +01:00
ValueRaider
3f33aa0377 Merge pull request #1119 from ranaroussi/dev
Improve error handling
2022-10-26 16:23:59 +01:00
ValueRaider
ecdc36ab8e Merge pull request #1118 from fredrik-corneliusson/dev
Better handling of error from yahoo API, added missing pytz dependency and fixed if statement syntax warnings
2022-10-25 21:47:43 +01:00
Fredrik Corneliusson
fbc5de153a Handle error from yahoo api 2022-10-25 21:52:09 +02:00
Fredrik Corneliusson
e4a228b830 Some fixes and better debug if failing to fetch timezone from ticker. 2022-10-25 21:36:42 +02:00
Fredrik Corneliusson
3cee66dea7 Some fixes and better debug if failing to fetch timezone from ticker. 2022-10-25 21:22:45 +02:00
ValueRaider
bec5b38189 Merge pull request #1117 from ranaroussi/dev
Merge all dev updates into main
2022-10-25 18:13:13 +01:00
ValueRaider
f5973b2c89 Merge branch 'main' into dev 2022-10-25 17:49:02 +01:00
ValueRaider
edb911b913 Pre-emptive ambiguous DST fix 2022-10-25 17:42:44 +01:00
ValueRaider
6117b0a042 Fix syntax error 2022-10-25 16:56:32 +01:00
ValueRaider
5cb5484a9a Fix tests.ticker to use new cache API 2022-10-25 14:16:14 +01:00
ValueRaider
4e33ddf615 Merge pull request #1113 from fredrik-corneliusson/dev
Fix cache error on read only system #1108
2022-10-25 14:14:08 +01:00
ValueRaider
6d87f3d689 Fix PR merge 2022-10-25 14:10:23 +01:00
ValueRaider
b30b97fa36 Merge pull request #1116 from ranaroussi/fix/outlier-repair-bugfixes
Fix price repair ; Improve repair test
2022-10-25 14:05:20 +01:00
ValueRaider
6253e1d8a0 Merge pull request #1112 from ranaroussi/fix/get-tz-performance
Improve performance of fetching Ticker timezone
2022-10-25 14:03:04 +01:00
ValueRaider
2dce6a705c Remove debug code 2022-10-25 14:01:44 +01:00
ValueRaider
df11fcdb37 Improve Ticker._fetch_ticker_tz() ; Change timeout default to 10 2022-10-25 13:59:51 +01:00
fredrik-corneliusson
567e2cf0d3 Merge branch 'ranaroussi:dev' into dev 2022-10-25 01:00:47 +02:00
Fredrik Corneliusson
3d6e88857b Merge remote-tracking branch 'origin/dev' into dev 2022-10-25 01:00:13 +02:00
Fredrik Corneliusson
59af19d84c Fix cache error on read only system #1108 2022-10-25 00:59:05 +02:00
ValueRaider
e07191b627 Fix price repair ; Improve repair test 2022-10-24 23:55:16 +01:00
ValueRaider
2623ba967d Simplify Ticker._fetch_ticker_tz() - 2 2022-10-24 13:46:39 +01:00
ValueRaider
fe1c705e24 Simplify Ticker._fetch_ticker_tz() 2022-10-24 13:45:25 +01:00
ValueRaider
9315f7b61d Add Ticker._fetch_ticker_tz() for faster tz fetch 2022-10-24 13:34:57 +01:00
ValueRaider
f76c788881 Remove debug print 2022-10-24 11:21:56 +01:00
ValueRaider
561f56c9f9 Merge pull request #1110 from ranaroussi/feature/outlier-repair
Feature - repair 100x price errors
2022-10-24 00:16:25 +01:00
ValueRaider
cf795ea0c7 Merge pull request #1109 from fredrik-corneliusson/dev
Fix for #1076
2022-10-24 00:03:40 +01:00
ValueRaider
643536b53b Fix '_fix_unit_mixups()' when data missing split-adjustment 2022-10-23 23:46:33 +01:00
fredrik-corneliusson
ae8a5ff996 Merge branch 'ranaroussi:dev' into dev 2022-10-23 23:12:21 +02:00
Fredrik Corneliusson
d01d378c8d Small cleanup to ease finding bug #1076. Begun by getting rid of multiple calls to self.info (get_info). 2022-10-23 22:37:07 +02:00
ValueRaider
9e0152aae4 Merge pull request #1105 from fredrik-corneliusson/dev
Fix and improve timezone cache concurrency
2022-10-23 16:43:53 +01:00
Fredrik Corneliusson
6c21c1994e Fix bug, create cache directory if it does not exists. 2022-10-23 15:27:41 +02:00
Fredrik Corneliusson
d24a25f579 Add missing typehint 2022-10-23 13:59:48 +02:00
Fredrik Corneliusson
422a50672d Lazy init of cache db and added migration of data from old CSV cache. 2022-10-23 13:43:40 +02:00
ValueRaider
6e09410c7d Improve repair feedback msg 2022-10-23 00:03:23 +01:00
ValueRaider
3c51687351 Add arg history(repair=False) to fix $/cents £/p mixups 2022-10-22 23:58:20 +01:00
Fredrik Corneliusson
783df54978 Bugfix, do not set tz in cache if it is None, just delete it. 2022-10-22 23:56:50 +02:00
Fredrik Corneliusson
c76bf0128f Improve timezone cache to make it more reliable when using threads by using SQLLite. 2022-10-22 23:30:48 +02:00
ValueRaider
33f57ac002 Merge pull request #1104 from ranaroussi/feature/improve-err-msgs
Improve error message handling
2022-10-22 16:30:47 +01:00
ValueRaider
c0e1536179 Improve error message handling
Add error check for 'period' ; simplify err-msg handling ; new arg 'raise_errors' to control print-vs-Exception
2022-10-21 23:36:37 +01:00
ValueRaider
303e0ea655 Merge pull request #1102 from ranaroussi/fix/price-tz-and-events
Various fixes to price data
2022-10-21 22:19:11 +01:00
ValueRaider
40424b71a6 Fix test 'test_intraDayWithEvents' 2022-10-21 17:26:15 +01:00
ValueRaider
b018f917a9 Port in: 'Fix when Yahoo returns price=NaNs on dividend day' 2022-10-21 17:21:19 +01:00
ValueRaider
28e50946ca Fix Ticker.dividends property 2022-10-21 15:44:36 +01:00
ValueRaider
841b485b1d Drop out-of-date-range events 2022-10-21 15:37:51 +01:00
ValueRaider
e842a9d657 Event-merge fixes: intra-day, weekly, lost tz, 'test_intraDayWithEvents' 2022-10-21 15:26:59 +01:00
ValueRaider
0f14728591 Add test 'test_tz_dst_ambiguous' 2022-10-21 15:26:45 +01:00
ValueRaider
69dfe325ae Add tz to daily price data 2022-10-21 12:54:48 +01:00
ValueRaider
f20aa9a875 Merge pull request #1099 from ranaroussi/feature/improve-tz-cache
Improve timezone cache
2022-10-21 10:31:01 +01:00
ValueRaider
5707c1aa65 Merge branch 'fix/download-timezones' into dev 2022-10-21 10:16:27 +01:00
ValueRaider
1e7f4a9a91 Strengthen tz-cache against bad/corrupt values - more 2022-10-20 22:09:37 +01:00
ValueRaider
37c36549e4 Add mutex to tz-cache update 2022-10-20 22:01:08 +01:00
ValueRaider
bda339b170 Strengthen tz-cache against bad/corrupt values 2022-10-20 21:59:20 +01:00
ValueRaider
f5995161ed Optimise TZ cache indexing 2022-10-20 21:54:58 +01:00
ValueRaider
4734e92090 Merge pull request #1070 from ranaroussi/fix/weekly-prices
Fix weekly/monthly prices across 2 rows
2022-10-14 23:18:59 +01:00
ValueRaider
5fdf2463e9 Merge branch 'dev' into fix/weekly-prices 2022-10-14 23:18:15 +01:00
ValueRaider
c679551faa Add unittest for duplication fix 2022-10-14 23:15:13 +01:00
ValueRaider
fdf52ac360 Merge pull request #1086 from ranaroussi/fix/events-merge
Fix merging pre-market events with min/hour prices
2022-10-14 14:08:35 +01:00
ValueRaider
94ad0bd955 Fix merging pre-market events with min/hour prices 2022-10-12 22:41:10 +01:00
ValueRaider
51c0ea0050 Enhance recent unittest 2022-10-10 15:37:55 +01:00
ValueRaider
3401d4dbe7 Merge pull request #1069 from ranaroussi/fix/events-merge
Fix merging of dividends/splits with prices
2022-10-10 14:01:31 +01:00
ValueRaider
a724585552 Tidy syntax 2022-10-10 14:00:10 +01:00
ValueRaider
1c85433cc0 Add unittest for div/splits merging 2022-10-10 13:58:17 +01:00
ValueRaider
5c0b2bbaa3 Fix weekly/monthly prices across 2 rows 2022-10-02 18:26:05 +01:00
ValueRaider
7d45a6709a Fix merging of dividends/splits with prices 2022-10-02 18:20:11 +01:00
13 changed files with 1236 additions and 123 deletions

View File

@@ -1,6 +1,16 @@
Change Log
===========
0.2.0rc1
--------
Jumping to 0.2 for this big update. 0.1.* will continue to receive bug-fixes
- timezone cache performance massively improved. Thanks @fredrik-corneliusson #1113 #1112 #1109 #1105 #1099
- price repair feature #1110
- fix merging of dividends/splits with prices #1069 #1086 #1102
- fix Yahoo returning latest price interval across 2 rows #1070
- optional: raise errors as exceptions: raise_errors=True #1104
- add proper unit tests #1069
0.1.81
------
- Fix unhandled tz-cache exception #1107

View File

@@ -198,6 +198,9 @@ data = yf.download( # or pdr.get_data_yahoo(...
# (optional, default is False)
auto_adjust = True,
# identify and attempt repair of currency unit mixups e.g. $/cents
repair = False,
# download pre/post regular market hours data
# (optional, default is False)
prepost = True,

View File

@@ -4,3 +4,4 @@ requests>=2.26
multitasking>=0.0.7
lxml>=4.5.1
appdirs>=1.4.4
pytz>=2022.5

View File

@@ -63,7 +63,7 @@ setup(
packages=find_packages(exclude=['contrib', 'docs', 'tests', 'examples']),
install_requires=['pandas>=0.24.0', 'numpy>=1.15',
'requests>=2.26', 'multitasking>=0.0.7',
'lxml>=4.5.1', 'appdirs>=1.4.4'],
'lxml>=4.5.1', 'appdirs>=1.4.4', 'pytz>=2022.5'],
entry_points={
'console_scripts': [
'sample=sample:main',

1
tests/__init__.py Normal file
View File

@@ -0,0 +1 @@
#!/usr/bin/env python

9
tests/context.py Normal file
View File

@@ -0,0 +1,9 @@
# -*- coding: utf-8 -*-
import sys
import os
# Make the repository root importable so the test modules exercise the local
# yfinance checkout rather than any globally-installed copy.
_parent_dp = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
_src_dp = _parent_dp
sys.path.insert(0, _src_dp)
import yfinance

482
tests/prices.py Normal file
View File

@@ -0,0 +1,482 @@
from .context import yfinance as yf
import unittest
import datetime as _dt
import pytz as _tz
import numpy as _np
import pandas as _pd
# Create temp session
# One shared temporary directory holds the requests_cache HTTP cache for every
# test in this module, so repeated runs do not hammer the live Yahoo API.
import requests_cache, tempfile
td = tempfile.TemporaryDirectory()
class TestPriceHistory(unittest.TestCase):
    """Tests for Ticker.history() price data.

    Covers: index correctness (midnight-indexed daily+ bars, no duplicated
    last row), merging of dividend/split events with prices (issue #521),
    ambiguous-DST handling (issue #1100), and the 100x price-repair feature.

    NOTE(review): these tests hit the live Yahoo API, so several of them
    skip themselves when market timing means the bug cannot be reproduced.
    """

    def setUp(self):
        # Share the module-level temp dir so every test reuses one HTTP cache.
        self.td = td
        self.session = requests_cache.CachedSession(self.td.name + '/' + "yfinance.cache")

    def tearDown(self):
        self.session.close()

    def test_daily_index(self):
        # Daily/weekly/monthly bars must be indexed at midnight exchange time.
        tkrs = ["BHP.AX", "IMP.JO", "BP.L", "PNL.L", "INTC"]
        intervals = ["1d", "1wk", "1mo"]
        for tkr in tkrs:
            dat = yf.Ticker(tkr, session=self.session)
            for interval in intervals:
                df = dat.history(period="5y", interval=interval)
                f = df.index.time == _dt.time(0)
                self.assertTrue(f.all())

    def test_duplicatingDaily(self):
        # Just after market close, Yahoo must not return today's bar twice.
        tkrs = ["IMP.JO", "BHG.JO", "SSW.JO", "BP.L", "INTC"]
        test_run = False
        for tkr in tkrs:
            dat = yf.Ticker(tkr, session=self.session)
            tz = dat._get_ticker_tz(debug_mode=False, proxy=None, timeout=None)
            dt_utc = _tz.timezone("UTC").localize(_dt.datetime.utcnow())
            dt = dt_utc.astimezone(_tz.timezone(tz))
            if dt.time() < _dt.time(17, 0):
                # Bug only reproducible after exchange close.
                continue
            test_run = True
            df = dat.history(start=dt.date() - _dt.timedelta(days=7), interval="1d")
            dt0 = df.index[-2]
            dt1 = df.index[-1]
            try:
                self.assertNotEqual(dt0, dt1)
            except Exception:
                print("Ticker = ", tkr)
                raise
        if not test_run:
            self.skipTest("Skipping test_duplicatingDaily() because only expected to fail just after market close")

    def test_duplicatingWeekly(self):
        # Mid-week, Yahoo must not split the current week across two rows.
        tkrs = ['MSFT', 'IWO', 'VFINX', '^GSPC', 'BTC-USD']
        test_run = False
        for tkr in tkrs:
            dat = yf.Ticker(tkr, session=self.session)
            tz = dat._get_ticker_tz(debug_mode=False, proxy=None, timeout=None)
            dt = _tz.timezone(tz).localize(_dt.datetime.now())
            if dt.date().weekday() not in [1, 2, 3, 4]:
                # Bug only reproducible Tuesday-Friday.
                continue
            test_run = True
            df = dat.history(start=dt.date() - _dt.timedelta(days=7), interval="1wk")
            dt0 = df.index[-2]
            dt1 = df.index[-1]
            try:
                self.assertNotEqual(dt0.week, dt1.week)
            except Exception:
                print("Ticker={}: Last two rows within same week:".format(tkr))
                print(df.iloc[df.shape[0] - 2:])
                raise
        if not test_run:
            self.skipTest("Skipping test_duplicatingWeekly() because not possible to fail Monday/weekend")

    def test_intraDayWithEvents(self):
        # TASE dividend release pre-market, doesn't merge nicely with intra-day data so check still present
        tkr = "ICL.TA"
        # Other TASE candidates: "ESLT.TA", "ONE.TA", "MGDL.TA"
        start_d = _dt.date.today() - _dt.timedelta(days=60)
        end_d = None
        df_daily = yf.Ticker(tkr, session=self.session).history(start=start_d, end=end_d, interval="1d", actions=True)
        df_daily_divs = df_daily["Dividends"][df_daily["Dividends"] != 0]
        if df_daily_divs.shape[0] == 0:
            self.skipTest("Skipping test_intraDayWithEvents() because 'ICL.TA' has no dividend in last 60 days")
        last_div_date = df_daily_divs.index[-1]
        start_d = last_div_date.date()
        end_d = last_div_date.date() + _dt.timedelta(days=1)
        df = yf.Ticker(tkr, session=self.session).history(start=start_d, end=end_d, interval="15m", actions=True)
        self.assertTrue((df["Dividends"] != 0.0).any())

    def _check_events_preserve_index(self, interval):
        """Shared body of the {daily,weekly,monthly}WithEvents tests.

        Reproduce issue #521: merging dividend/split events into price data
        must not add or drop index rows. Checks two tickers against each
        other, then each ticker with actions=True vs actions=False.
        """
        tkr1 = "QQQ"
        tkr2 = "GDX"
        start_d = "2014-12-29"
        end_d = "2020-11-29"
        df1 = yf.Ticker(tkr1).history(start=start_d, end=end_d, interval=interval, actions=True)
        df2 = yf.Ticker(tkr2).history(start=start_d, end=end_d, interval=interval, actions=True)
        try:
            self.assertTrue(df1.index.equals(df2.index))
        except Exception:
            missing_from_df1 = df2.index.difference(df1.index)
            missing_from_df2 = df1.index.difference(df2.index)
            print("{} missing these dates: {}".format(tkr1, missing_from_df1))
            print("{} missing these dates: {}".format(tkr2, missing_from_df2))
            raise
        # Test that index same with and without events:
        for tkr in (tkr1, tkr2):
            df1 = yf.Ticker(tkr, session=self.session).history(start=start_d, end=end_d, interval=interval, actions=True)
            df2 = yf.Ticker(tkr, session=self.session).history(start=start_d, end=end_d, interval=interval, actions=False)
            try:
                self.assertTrue(df1.index.equals(df2.index))
            except Exception:
                missing_from_df1 = df2.index.difference(df1.index)
                missing_from_df2 = df1.index.difference(df2.index)
                print("{}-with-events missing these dates: {}".format(tkr, missing_from_df1))
                print("{}-without-events missing these dates: {}".format(tkr, missing_from_df2))
                raise

    def test_dailyWithEvents(self):
        self._check_events_preserve_index("1d")

    def test_weeklyWithEvents(self):
        self._check_events_preserve_index("1wk")

    def test_monthlyWithEvents(self):
        self._check_events_preserve_index("1mo")

    def test_tz_dst_ambiguous(self):
        # Reproduce issue #1100: midnight that occurs twice during DST
        # roll-back used to raise AmbiguousTimeError.
        try:
            yf.Ticker("ESLT.TA", session=self.session).history(start="2002-10-06", end="2002-10-09", interval="1d")
        except _tz.exceptions.AmbiguousTimeError:
            raise Exception("Ambiguous DST issue not resolved")

    def _run_repair_weekly(self, start, end, skip_msg):
        """Shared body of the weekly 100x-repair tests.

        Sometimes Yahoo returns prices 100x the correct value - suspected
        mixup between GBP/pence or USD/cents etc. E.g. ticker PNL.L.

        :param start/end: date range passed to history()
        :param skip_msg: message for skipTest() when Yahoo returns no 100x
            errors in the range (nothing to repair, so nothing to test)
        """
        tkr = "PNL.L"
        error_threshold = 1000.0
        dat = yf.Ticker(tkr, session=self.session)
        df_bad = dat.history(start=start, end=end, interval="1wk", auto_adjust=False, repair=False)
        # Record the errors that will be repaired
        data_cols = ["Low", "High", "Open", "Close", "Adj Close"]
        f_outlier = _np.where(df_bad[data_cols] > error_threshold)
        if len(f_outlier[0]) == 0:
            self.skipTest(skip_msg)
        indices = list(zip(f_outlier[0], f_outlier[1]))
        df = dat.history(start=start, end=end, interval="1wk", auto_adjust=False, repair=True)
        # First test - no errors left after repair
        df_data = df[data_cols].values
        for i, j in indices:
            try:
                self.assertTrue(df_data[i, j] < error_threshold)
            except Exception:
                print("Detected uncorrected error: idx={}, {}={}".format(df.index[i], data_cols[j], df_data[i, j]))
                raise
        # Second test - all differences between pre- and post-repair should be ~100x
        ratio = (df_bad[data_cols].values / df[data_cols].values).round(2)
        # - round near-100 ratios to 100:
        f_near_100 = (ratio > 90) & (ratio < 110)
        ratio[f_near_100] = (ratio[f_near_100] / 10).round().astype(int) * 10  # round ratio to nearest 10
        # - now test
        f_100 = ratio == 100
        f_1 = ratio == 1
        self.assertTrue((f_100 | f_1).all())
        # Third test: compare directly against daily data, unadjusted
        df = dat.history(start=start, end=end, interval="1wk", auto_adjust=False, repair=True)
        for i in indices:
            dt = df.index[i[0]]
            df_daily = dat.history(start=dt, end=dt + _dt.timedelta(days=7), interval="1d", auto_adjust=False, repair=True)
            # Manually construct weekly price data from daily
            df_yf_weekly = df_daily.copy()
            df_yf_weekly["_weekStart"] = _pd.to_datetime(df_yf_weekly.index.tz_localize(None).to_period('W-SUN').start_time).tz_localize(df.index.tz)
            # Temporarily map split=0 to 1 so the 'prod' aggregation is a no-op
            df_yf_weekly.loc[df_yf_weekly["Stock Splits"] == 0, "Stock Splits"] = 1
            df_yf_weekly = df_yf_weekly.groupby("_weekStart").agg(
                Open=("Open", "first"),
                Close=("Close", "last"),
                AdjClose=("Adj Close", "last"),
                Low=("Low", "min"),
                High=("High", "max"),
                Volume=("Volume", "sum"),
                Dividends=("Dividends", "sum"),
                StockSplits=("Stock Splits", "prod")).rename(columns={"StockSplits": "Stock Splits", "AdjClose": "Adj Close"})
            df_yf_weekly.loc[df_yf_weekly["Stock Splits"] == 1, "Stock Splits"] = 0
            if df_yf_weekly.index[0] not in df_daily.index:
                # Exchange closed Monday. In this case, Yahoo sets Open to last week close
                df_daily_last_week = dat.history(start=dt - _dt.timedelta(days=7), end=dt, interval="1d", auto_adjust=False, repair=True)
                df_yf_weekly["Open"] = df_daily_last_week["Close"][-1]
                df_yf_weekly["Low"] = _np.minimum(df_yf_weekly["Low"], df_yf_weekly["Open"])
            # Compare fetched-weekly vs constructed-weekly:
            df_yf_weekly = df_yf_weekly[df.columns]
            try:
                # Note: Adj Close has tiny variance depending on date range requested
                cmp_cols = ["Open", "Close", "Low", "High"]
                self.assertTrue(_np.equal(df.loc[dt, cmp_cols].values, df_yf_weekly[cmp_cols].iloc[0].values).all())
                self.assertLess(abs(df.loc[dt, "Adj Close"] / df_yf_weekly["Adj Close"].iloc[0] - 1.0), 0.000001)
            except Exception:
                for c in df.columns:
                    if c == "Adj Close":
                        # Fail when the relative difference exceeds the tolerance
                        # (original had this comparison inverted).
                        fail = not (abs(df.loc[dt, c] / df_yf_weekly[c].iloc[0] - 1.0) < 0.000001)
                    else:
                        fail = df.loc[dt, c] != df_yf_weekly[c].iloc[0]
                    if fail:
                        print("dt = ", dt)
                        print("df.loc[dt]:", type(df.loc[dt]))
                        print(df.loc[dt].to_dict())
                        print("df_yf_weekly.iloc[0]:", type(df_yf_weekly.iloc[0]))
                        print(df_yf_weekly.iloc[0].to_dict())
                        print("Result:", df.loc[dt, c])
                        print("Answer:", df_yf_weekly[c].iloc[0])
                        raise Exception("Mismatch in column '{}'".format(c))

    def test_repair_weekly(self):
        # Sometimes, Yahoo returns prices 100x the correct value.
        # Suspect mixup between £/pence or $/cents etc.
        # E.g. ticker PNL.L
        end = min(_dt.date.today(), _dt.date(2023, 1, 1))
        self._run_repair_weekly("2020-01-06", end,
                                "Skipping test_repair_weekly() because no price 100x errors to repair")

    def test_repair_weekly2_preSplit(self):
        # PNL.L has a stock-split in 2022. Sometimes requesting data before
        # 2022 is not split-adjusted, so repair must handle that too.
        self._run_repair_weekly("2020-01-06", "2021-06-01",
                                "Skipping test_repair_weekly2_preSplit() because no price 100x errors to repair")

    def test_repair_daily(self):
        # Sometimes, Yahoo returns prices 100x the correct value.
        # Suspect mixup between £/pence or $/cents etc.
        # E.g. ticker PNL.L
        tkr = "PNL.L"
        start = "2020-01-01"
        end = min(_dt.date.today(), _dt.date(2023, 1, 1))
        dat = yf.Ticker(tkr, session=self.session)
        data_cols = ["Low", "High", "Open", "Close", "Adj Close"]
        df_bad = dat.history(start=start, end=end, interval="1d", auto_adjust=False, repair=False)
        f_outlier = _np.where(df_bad[data_cols] > 1000.0)
        if len(f_outlier[0]) == 0:
            self.skipTest("Skipping test_repair_daily() because no price 100x errors to repair")
        # Outliers detected
        indices = list(zip(f_outlier[0], f_outlier[1]))
        df = dat.history(start=start, end=end, interval="1d", auto_adjust=False, repair=True)
        # First test - no errors left
        df_data = df[data_cols].values
        for i, j in indices:
            try:
                self.assertTrue(df_data[i, j] < 1000.0)
            except Exception:
                print("Detected uncorrected error: idx={}, {}={}".format(df.index[i], data_cols[j], df_data[i, j]))
                raise
        # Second test - all differences should be either ~1x or ~100x
        ratio = (df_bad[data_cols].values / df[data_cols].values).round(2)
        # - round near-100 ratio to 100:
        f = ratio > 90
        ratio[f] = (ratio[f] / 10).round().astype(int) * 10  # round ratio to nearest 10
        # - now test
        f_100 = ratio == 100
        f_1 = ratio == 1
        self.assertTrue((f_100 | f_1).all())
if __name__ == '__main__':
    unittest.main()

    # # Run tests sequentially:
    # import inspect
    # test_src = inspect.getsource(TestPriceHistory)
    # unittest.TestLoader.sortTestMethodsUsing = lambda _, x, y: (
    #     test_src.index(f"def {x}") - test_src.index(f"def {y}")
    # )
    # unittest.main(verbosity=2)

    # Cleanup must stay inside the guard: at module level it would delete the
    # shared cache dir as soon as a test runner imports this module.
    # (Unreachable after unittest.main() unless exit=False is passed.)
    td.cleanup()

33
tests/ticker.py Normal file
View File

@@ -0,0 +1,33 @@
from .context import yfinance as yf
import unittest
class TestTicker(unittest.TestCase):
    """Tests for misc Ticker functionality, currently timezone fetching."""

    def setUp(self):
        pass

    def tearDown(self):
        pass

    def test_getTz(self):
        # Every valid ticker has a timezone; fetch must succeed even when the
        # tz-cache is cold.
        tkrs = ["IMP.JO", "BHG.JO", "SSW.JO", "BP.L", "INTC"]
        for tkr in tkrs:
            # First step: remove ticker from tz-cache
            yf.utils.get_tz_cache().store(tkr, None)
            # Test:
            dat = yf.Ticker(tkr)
            tz = dat._get_ticker_tz(debug_mode=False, proxy=None, timeout=None)
            self.assertIsNotNone(tz)
# Allow running this module directly as a script.
if __name__ == '__main__':
    unittest.main()

View File

@@ -29,6 +29,8 @@ import pandas as _pd
import numpy as _np
import re as _re
from pytz import UnknownTimeZoneError
try:
from urllib.parse import quote as urlencode
except ImportError:
@@ -100,8 +102,8 @@ class TickerBase():
def history(self, period="1mo", interval="1d",
start=None, end=None, prepost=False, actions=True,
auto_adjust=True, back_adjust=False, keepna=False,
proxy=None, rounding=False, timeout=None, **kwargs):
auto_adjust=True, back_adjust=False, repair=False, keepna=False,
proxy=None, rounding=False, timeout=10, **kwargs):
"""
:Parameters:
period : str
@@ -123,6 +125,9 @@ class TickerBase():
Adjust all OHLC automatically? Default is True
back_adjust: bool
Back-adjusted data to mimic true historical prices
repair: bool
Detect currency unit 100x mixups and attempt repair
Default is False
keepna: bool
Keep NaN rows returned by Yahoo?
Default is False
@@ -134,33 +139,38 @@ class TickerBase():
timeout: None or float
If not None stops waiting for a response after given number of
seconds. (Can also be a fraction of a second e.g. 0.01)
Default is None.
Default is 10 seconds.
**kwargs: dict
debug: bool
Optional. If passed as False, will suppress
error message printing to console.
raise_errors: bool
Optional. If True, then raise errors as
exceptions instead of printing to console.
"""
# Work with errors
debug_mode = True
if "debug" in kwargs and isinstance(kwargs["debug"], bool):
debug_mode = kwargs["debug"]
err_msg = "No data found for this date range, symbol may be delisted"
raise_errors = False
if "raise_errors" in kwargs and isinstance(kwargs["raise_errors"], bool):
raise_errors = kwargs["raise_errors"]
if start or period is None or period.lower() == "max":
# Check can get TZ. Fail => probably delisted
try:
tz = self._get_ticker_tz()
except KeyError as e:
if "exchangeTimezoneName" in str(e):
shared._DFS[self.ticker] = utils.empty_df()
shared._ERRORS[self.ticker] = err_msg
if "many" not in kwargs and debug_mode:
tz = self._get_ticker_tz(debug_mode, proxy, timeout)
if tz is None:
# Every valid ticker has a timezone. Missing = problem
err_msg = "No timezone found, symbol certainly delisted"
shared._DFS[self.ticker] = utils.empty_df()
shared._ERRORS[self.ticker] = err_msg
if "many" not in kwargs and debug_mode:
if raise_errors:
raise Exception('%s: %s' % (self.ticker, err_msg))
else:
print('- %s: %s' % (self.ticker, err_msg))
return utils.empty_df()
else:
raise
return utils.empty_df()
if end is None:
end = int(_time.time())
@@ -215,28 +225,31 @@ class TickerBase():
except Exception:
pass
if data is None or not type(data) is dict or 'status_code' in data.keys():
shared._DFS[self.ticker] = utils.empty_df()
shared._ERRORS[self.ticker] = err_msg
if "many" not in kwargs and debug_mode:
print('- %s: %s' % (self.ticker, err_msg))
return utils.empty_df()
if "chart" in data and data["chart"]["error"]:
err_msg = "No data found for this date range, symbol may be delisted"
fail = False
if data is None or not type(data) is dict:
fail = True
elif type(data) is dict and 'status_code' in data.keys():
err_msg += "(Yahoo status_code = {})".format(data["status_code"])
fail = True
elif "chart" in data and data["chart"]["error"]:
err_msg = data["chart"]["error"]["description"]
fail = True
elif not "chart" in data or data["chart"]["result"] is None or not data["chart"]["result"]:
fail = True
elif not period is None and not "timestamp" in data["chart"]["result"][0] and not period in data["chart"]["result"][0]["meta"]["validRanges"]:
# User provided a bad period. The minimum should be '1d', but sometimes Yahoo accepts '1h'.
err_msg = "Period '{}' is invalid, must be one of {}".format(period, data["chart"]["result"][0]["meta"]["validRanges"])
fail = True
if fail:
shared._DFS[self.ticker] = utils.empty_df()
shared._ERRORS[self.ticker] = err_msg
if "many" not in kwargs and debug_mode:
print('- %s: %s' % (self.ticker, err_msg))
return shared._DFS[self.ticker]
elif "chart" not in data or data["chart"]["result"] is None or \
not data["chart"]["result"]:
shared._DFS[self.ticker] = utils.empty_df()
shared._ERRORS[self.ticker] = err_msg
if "many" not in kwargs and debug_mode:
print('- %s: %s' % (self.ticker, err_msg))
return shared._DFS[self.ticker]
if raise_errors:
raise Exception('%s: %s' % (self.ticker, err_msg))
else:
print('%s: %s' % (self.ticker, err_msg))
return utils.empty_df()
# parse quotes
try:
@@ -250,7 +263,10 @@ class TickerBase():
shared._DFS[self.ticker] = utils.empty_df()
shared._ERRORS[self.ticker] = err_msg
if "many" not in kwargs and debug_mode:
print('- %s: %s' % (self.ticker, err_msg))
if raise_errors:
raise Exception('%s: %s' % (self.ticker, err_msg))
else:
print('%s: %s' % (self.ticker, err_msg))
return shared._DFS[self.ticker]
# 2) fix weired bug with Yahoo! - returning 60m for 30m bars
@@ -273,6 +289,17 @@ class TickerBase():
except Exception:
pass
tz_exchange = data["chart"]["result"][0]["meta"]["exchangeTimezoneName"]
# Note: ordering is important. If you change order, run the tests!
quotes = utils.fix_Yahoo_returning_live_separate(quotes, params["interval"], tz_exchange)
quotes = utils.set_df_tz(quotes, params["interval"], tz_exchange)
quotes = utils.fix_Yahoo_dst_issue(quotes, params["interval"])
if repair:
# Do this before auto/back adjust
quotes = self._fix_unit_mixups(quotes, interval, tz_exchange)
# Auto/back adjust
try:
if auto_adjust:
quotes = utils.auto_adjust(quotes)
@@ -286,7 +313,10 @@ class TickerBase():
shared._DFS[self.ticker] = utils.empty_df()
shared._ERRORS[self.ticker] = err_msg
if "many" not in kwargs and debug_mode:
print('- %s: %s' % (self.ticker, err_msg))
if raise_errors:
raise Exception('%s: %s' % (self.ticker, err_msg))
else:
print('%s: %s' % (self.ticker, err_msg))
if rounding:
quotes = _np.round(quotes, data[
@@ -295,27 +325,41 @@ class TickerBase():
# actions
dividends, splits = utils.parse_actions(data["chart"]["result"][0])
if start is not None:
startDt = _pd.to_datetime(_datetime.datetime.utcfromtimestamp(start))
if dividends is not None:
dividends = dividends[dividends.index>=startDt]
if splits is not None:
splits = splits[splits.index>=startDt]
if end is not None:
endDt = _pd.to_datetime(_datetime.datetime.utcfromtimestamp(end))
if dividends is not None:
dividends = dividends[dividends.index<endDt]
if splits is not None:
splits = splits[splits.index<endDt]
if splits is not None:
splits = utils.set_df_tz(splits, interval, tz_exchange)
if dividends is not None:
dividends = utils.set_df_tz(dividends, interval, tz_exchange)
tz_exchange = data["chart"]["result"][0]["meta"]["exchangeTimezoneName"]
# combine
df = _pd.concat([quotes, dividends, splits], axis=1, sort=True)
df["Dividends"].fillna(0, inplace=True)
df["Stock Splits"].fillna(0, inplace=True)
# index eod/intraday
df.index = df.index.tz_localize("UTC").tz_convert(tz_exchange)
df = utils.fix_Yahoo_dst_issue(df, params["interval"])
if params["interval"][-1] == "m":
df.index.name = "Datetime"
elif params["interval"] == "1h":
pass
# Combine
df = quotes.sort_index()
if dividends.shape[0] > 0:
df = utils.safe_merge_dfs(df, dividends, interval)
if "Dividends" in df.columns:
df.loc[df["Dividends"].isna(),"Dividends"] = 0
else:
df["Dividends"] = 0.0
if splits.shape[0] > 0:
df = utils.safe_merge_dfs(df, splits, interval)
if "Stock Splits" in df.columns:
df.loc[df["Stock Splits"].isna(),"Stock Splits"] = 0
else:
df["Stock Splits"] = 0.0
if params["interval"][-1] in ("m",'h'):
df.index.name = "Datetime"
else:
# If a midnight is during DST transition hour when clocks roll back,
# meaning clock hits midnight twice, then use the 2nd (ambiguous=True)
df.index = _pd.to_datetime(df.index.date).tz_localize(tz_exchange, ambiguous=True)
df.index.name = "Date"
# duplicates and missing rows cleanup
@@ -331,22 +375,229 @@ class TickerBase():
# ------------------------
def _get_ticker_tz(self):
if not self._tz is None:
def _fix_unit_mixups(self, df, interval, tz_exchange):
    """Detect and repair prices that Yahoo returned in the wrong currency unit.

    Sometimes Yahoo returns a few prices in cents/pence instead of $/£,
    i.e. 100x too big. Outliers are detected as values ~100x the local
    (3x3 median-filtered) median, then repaired by re-fetching the affected
    rows at a finer interval ('1wk'->'1d', '1d'->'1h') via self.history().

    :param df: OHLC price DataFrame to repair (modified and returned).
    :param interval: interval of 'df' rows, e.g. "1d" or "1wk". Only these
        two have a repair path; other intervals return 'df' unchanged.
    :param tz_exchange: exchange timezone; 'df' index is (re)localized to it.
    :return: 'df' with detected 100x mixups corrected where possible.
    """
    if df.shape[0] == 0:
        return df
    if df.shape[0] == 1:
        # Need multiple rows to confidently identify outliers
        return df

    # Normalize index to exchange timezone before comparing against sub-interval fetches
    if df.index.tz is None:
        df.index = df.index.tz_localize(tz_exchange)
    else:
        df.index = df.index.tz_convert(tz_exchange)

    # Only import scipy if users actually want function. To avoid
    # adding it to dependencies.
    from scipy import ndimage as _ndimage

    data_cols = ["Open","High","Low","Close"]
    data_cols = [c for c in data_cols if c in df.columns]
    n = df.shape[0]
    # Local median per cell: 3x3 window over (rows x OHLC columns), mirrored at edges
    median = _ndimage.median_filter(df[data_cols].values, size=(3,3), mode='mirror')

    if (median==0).any():
        raise Exception("median contains zeroes, why?")
    ratio = df[data_cols].values/median
    ratio_rounded = (ratio/10).round()*10  # round ratio to nearest 10
    f = (ratio_rounded)==100  # True where a cell is ~100x its local median

    # Store each mixup, keyed by row index; 'fields' = which OHLC columns are bad
    mixups = {}
    for j in range(len(data_cols)):
        fj = f[:,j]
        if fj.any():
            dc = data_cols[j]
            for i in _np.where(fj)[0]:
                idx = df.index[i]
                if idx not in mixups:
                    mixups[idx] = {"data":df.loc[idx,data_cols], "fields":set([dc])}
                else:
                    mixups[idx]["fields"].add(dc)
    n_mixups = len(mixups)

    if len(mixups) > 0:
        # Problem with Yahoo's mixup is they calculate high & low after, so they can be corrupted.
        # If interval is weekly then can correct with daily. But if smaller intervals then
        # restricted to recent times:
        # - daily = hourly restricted to last 730 days
        sub_interval = None
        td_range = None
        if interval == "1wk":
            # Correct by fetching week of daily data
            sub_interval = "1d"
            td_range = _datetime.timedelta(days=7)
        elif interval == "1d":
            # Correct by fetching day of hourly data
            sub_interval = "1h"
            td_range = _datetime.timedelta(days=1)
        else:
            print("WARNING: Have not implemented repair for '{}' interval. Contact developers".format(interval))
            return df

        # This first pass will correct all errors in Open/Close/Adj Close columns.
        # It will also *attempt* to correct Low/High columns, but only if can get price data.
        # NOTE: iterating a snapshot of keys, so deleting from 'mixups' inside is safe.
        for idx in sorted(list(mixups.keys())):
            m = mixups[idx]
            # Although only some fields in row exhibit 100x error, normally the other fields are also corrupted,
            # so need to recalculate all fields in row.
            if td_range is None:
                raise Exception("was hoping this wouldn't happen")
            start = idx.date()
            if sub_interval=="1h" and (_datetime.date.today()-start) > _datetime.timedelta(days=729):
                # Don't bother requesting more price data, Yahoo will reject
                pass
            else:
                if sub_interval=="1h":
                    df_fine = self.history(start=idx.date(), end=idx.date()+td_range, interval=sub_interval, auto_adjust=False)
                else:
                    # Fetch the surrounding weeks too, to know previous week's close
                    df_fine = self.history(start=idx.date()-td_range, end=idx.date()+td_range, interval=sub_interval, auto_adjust=False)

                # First, check whether df_fine has different split-adjustment than df.
                # If it is different, then adjust df_fine to match df
                good_fields = list(set(data_cols)-m["fields"])
                median = df.loc[idx,good_fields].median()
                median_fine = _np.median(df_fine[good_fields].values)
                ratio = round(median/median_fine, 1)
                ratio_rcp = round(median_fine/median, 1)
                if ratio==1 and ratio_rcp==1:
                    # Good!
                    pass
                else:
                    if ratio>1:
                        # data has different split-adjustment than fine-grained data
                        # Adjust fine-grained to match
                        df_fine[data_cols] *= ratio
                    elif ratio_rcp>1:
                        # data has different split-adjustment than fine-grained data
                        # Adjust fine-grained to match
                        df_fine[data_cols] *= 1.0/ratio_rcp
                    median_fine = _np.median(df_fine[good_fields].values)
                    ratio = round(median/median_fine, 1)
                    ratio_rcp = round(median_fine/median, 1)

                if sub_interval != "1h":
                    # Split fetched data into previous week vs the broken week
                    df_last_week = df_fine[df_fine.index<idx]
                    df_fine = df_fine[df_fine.index>=idx]

                if "High" in m["fields"]:
                    df.loc[idx, "High"] = df_fine["High"].max()
                    m["fields"].remove("High")
                if "Low" in m["fields"]:
                    df.loc[idx, "Low"] = df_fine["Low"].min()
                    m["fields"].remove("Low")
                if "Open" in m["fields"]:
                    if sub_interval != "1h" and idx != df_fine.index[0]:
                        # Exchange closed Monday. In this case, Yahoo sets Open to last week close
                        df.loc[idx, "Open"] = df_last_week["Close"][-1]
                        df.loc[idx, "Low"] = min(df.loc[idx, "Open"], df.loc[idx, "Low"])
                    else:
                        df.loc[idx, "Open"] = df_fine["Open"].iloc[0]
                    m["fields"].remove("Open")
                if "Close" in m["fields"]:
                    df.loc[idx, "Close"] = df_fine["Close"].iloc[-1]
                    m["fields"].remove("Close")
                    # Assume 'Adj Close' also corrupted, easier than detecting whether true
                    df.loc[idx, "Adj Close"] = df_fine["Adj Close"].iloc[-1]
                if len(m["fields"])==0:
                    del mixups[idx]

        # This second pass will *crudely* "fix" any remaining errors in High/Low
        # simply by ensuring they don't contradict e.g. Low = 100x High
        if len(mixups)>0:
            for idx in sorted(list(mixups.keys())):
                m = mixups[idx]
                row = df.loc[idx,["Open","Close"]]
                if "High" in m["fields"]:
                    df.loc[idx,"High"] = row.max()
                    m["fields"].remove("High")
                if "Low" in m["fields"]:
                    df.loc[idx,"Low"] = row.min()
                    m["fields"].remove("Low")
                if len(m["fields"])==0:
                    del mixups[idx]
        n_fixed = n_mixups - len(mixups)
        print("{}: fixed {} currency unit mixups in {} price data".format(self.ticker, n_fixed, interval))
        if len(mixups)>0:
            print("  ... and failed to correct {}".format(len(mixups)))

    return df
def _get_ticker_tz(self, debug_mode, proxy, timeout):
    """Return the exchange timezone name for this ticker.

    Resolution order: in-memory attribute -> on-disk tz cache -> Yahoo fetch.
    Fix: this block had lines of the removed CSV-cache implementation
    interleaved with the new sqlite-cache implementation, making it
    incoherent; the dead remnants are deleted.

    :param debug_mode: if True, helpers print diagnostics on failure.
    :param proxy: optional proxy (str or {'https': ...} dict) for the fetch.
    :param timeout: request timeout in seconds for the fetch.
    :return: timezone name string, or None if it could not be determined.
    """
    if self._tz is not None:
        return self._tz
    cache = utils.get_tz_cache()
    tz = cache.lookup(self.ticker)

    if tz and not utils.is_valid_timezone(tz):
        # Clear from cache and force re-fetch
        cache.store(self.ticker, None)
        tz = None

    if tz is None:
        # May return None on network/API failure;
        # assumes is_valid_timezone tolerates None — TODO confirm
        tz = self._fetch_ticker_tz(debug_mode, proxy, timeout)

        if utils.is_valid_timezone(tz):
            # info fetch is relatively slow so cache timezone
            cache.store(self.ticker, tz)
        else:
            tz = None

    self._tz = tz
    return tz
def _fetch_ticker_tz(self, debug_mode, proxy, timeout):
    """Fetch the exchange timezone name for this ticker from Yahoo.

    Queries the v8 chart endpoint for a minimal 1-day window just to read
    'exchangeTimezoneName' from the response metadata.

    :param debug_mode: if True, print diagnostics when the fetch fails.
    :param proxy: optional proxy, either a URL string or {'https': url} dict.
    :param timeout: request timeout in seconds.
    :return: timezone name string, or None on any failure.
    """
    # Query Yahoo for basic price data just to get returned timezone
    params = {"range":"1d", "interval":"1d"}

    # setup proxy in requests format
    if proxy is not None:
        if isinstance(proxy, dict) and "https" in proxy:
            proxy = proxy["https"]
        proxy = {"https": proxy}

    # Getting data from json
    url = "{}/v8/finance/chart/{}".format(self._base_url, self.ticker)

    # Fall back to the plain requests module if no session was supplied
    session = self.session or _requests

    try:
        data = session.get(url=url, params=params, proxies=proxy, headers=utils.user_agent_headers, timeout=timeout)
        data = data.json()
    except Exception as e:
        # Network error, timeout, or non-JSON body
        if debug_mode:
            print("Failed to get ticker '{}' reason: {}".format(self.ticker, e))
        return None
    else:
        error = data.get('chart', {}).get('error', None)
        if error:
            # explicit error from yahoo api
            if debug_mode:
                print("Got error from yahoo api for ticker {}, Error: {}".format(self.ticker, error))
        else:
            try:
                return data["chart"]["result"][0]["meta"]["exchangeTimezoneName"]
            except Exception as err:
                # Response shape was not what we expected; dump it for debugging
                if debug_mode:
                    print("Could not get exchangeTimezoneName for ticker '{}' reason: {}".format(self.ticker, err))
                    print("Got response: ")
                    print("-------------")
                    print(" {}".format(data))
                    print("-------------")
    return None
def _get_info(self, proxy=None):
# setup proxy in requests format
@@ -355,10 +606,8 @@ class TickerBase():
proxy = proxy["https"]
proxy = {"https": proxy}
if (self._info is None) or (self._sustainability is None) or (self._recommendations is None):
## Need to fetch
pass
else:
if (self._info is not None) or (self._sustainability is not None) or (self._recommendations):
# No need to fetch
return
ticker_url = "{}/{}".format(self._scrape_url, self.ticker)
@@ -929,7 +1178,7 @@ class TickerBase():
dates[cn] = _pd.to_datetime(dates[cn], format="%b %d, %Y, %I %p")
# - instead of attempting decoding of ambiguous timezone abbreviation, just use 'info':
dates[cn] = dates[cn].dt.tz_localize(
tz=self.info["exchangeTimezoneName"])
tz=self.get_info()["exchangeTimezoneName"])
dates = dates.set_index("Earnings Date")

View File

@@ -30,9 +30,9 @@ from . import shared
def download(tickers, start=None, end=None, actions=False, threads=True, ignore_tz=True,
group_by='column', auto_adjust=False, back_adjust=False, keepna=False,
group_by='column', auto_adjust=False, back_adjust=False, repair=False, keepna=False,
progress=True, period="max", show_errors=True, interval="1d", prepost=False,
proxy=None, rounding=False, timeout=None, **kwargs):
proxy=None, rounding=False, timeout=10, **kwargs):
"""Download yahoo tickers
:Parameters:
tickers : str, list
@@ -56,6 +56,9 @@ def download(tickers, start=None, end=None, actions=False, threads=True, ignore_
Default is False
auto_adjust: bool
Adjust all OHLC automatically? Default is False
repair: bool
Detect currency unit 100x mixups and attempt repair
Default is False
keepna: bool
Keep NaN rows returned by Yahoo?
Default is False
@@ -111,7 +114,7 @@ def download(tickers, start=None, end=None, actions=False, threads=True, ignore_
_download_one_threaded(ticker, period=period, interval=interval,
start=start, end=end, prepost=prepost,
actions=actions, auto_adjust=auto_adjust,
back_adjust=back_adjust, keepna=keepna,
back_adjust=back_adjust, repair=repair, keepna=keepna,
progress=(progress and i > 0), proxy=proxy,
rounding=rounding, timeout=timeout)
while len(shared._DFS) < len(tickers):
@@ -123,7 +126,8 @@ def download(tickers, start=None, end=None, actions=False, threads=True, ignore_
data = _download_one(ticker, period=period, interval=interval,
start=start, end=end, prepost=prepost,
actions=actions, auto_adjust=auto_adjust,
back_adjust=back_adjust, keepna=keepna, proxy=proxy,
back_adjust=back_adjust, repair=repair, keepna=keepna,
proxy=proxy,
rounding=rounding, timeout=timeout)
shared._DFS[ticker.upper()] = data
if progress:
@@ -191,12 +195,12 @@ def _realign_dfs():
@_multitasking.task
def _download_one_threaded(ticker, start=None, end=None,
auto_adjust=False, back_adjust=False,
auto_adjust=False, back_adjust=False, repair=False,
actions=False, progress=True, period="max",
interval="1d", prepost=False, proxy=None,
keepna=False, rounding=False, timeout=None):
keepna=False, rounding=False, timeout=10):
data = _download_one(ticker, start, end, auto_adjust, back_adjust,
data = _download_one(ticker, start, end, auto_adjust, back_adjust, repair,
actions, period, interval, prepost, proxy, rounding,
keepna, timeout)
shared._DFS[ticker.upper()] = data
@@ -205,14 +209,14 @@ def _download_one_threaded(ticker, start=None, end=None,
def _download_one(ticker, start=None, end=None,
auto_adjust=False, back_adjust=False,
auto_adjust=False, back_adjust=False, repair=False,
actions=False, period="max", interval="1d",
prepost=False, proxy=None, rounding=False,
keepna=False, timeout=None):
keepna=False, timeout=10):
return Ticker(ticker).history(period=period, interval=interval,
start=start, end=end, prepost=prepost,
actions=actions, auto_adjust=auto_adjust,
back_adjust=back_adjust, proxy=proxy,
back_adjust=back_adjust, repair=repair, proxy=proxy,
rounding=rounding, keepna=keepna, many=True,
timeout=timeout)

View File

@@ -46,27 +46,31 @@ class Tickers():
def history(self, period="1mo", interval="1d",
start=None, end=None, prepost=False,
actions=True, auto_adjust=True, proxy=None,
actions=True, auto_adjust=True, repair=False,
proxy=None,
threads=True, group_by='column', progress=True,
timeout=None, **kwargs):
timeout=10, **kwargs):
return self.download(
period, interval,
start, end, prepost,
actions, auto_adjust, proxy,
actions, auto_adjust, repair,
proxy,
threads, group_by, progress,
timeout, **kwargs)
def download(self, period="1mo", interval="1d",
start=None, end=None, prepost=False,
actions=True, auto_adjust=True, proxy=None,
actions=True, auto_adjust=True, repair=False,
proxy=None,
threads=True, group_by='column', progress=True,
timeout=None, **kwargs):
timeout=10, **kwargs):
data = multi.download(self.symbols,
start=start, end=end,
actions=actions,
auto_adjust=auto_adjust,
repair=repair,
period=period,
interval=interval,
prepost=prepost,

View File

@@ -22,6 +22,8 @@
from __future__ import print_function
import datetime as _datetime
from typing import Dict, Union
import pytz as _tz
import requests as _requests
import re as _re
@@ -30,6 +32,12 @@ import numpy as _np
import sys as _sys
import os as _os
import appdirs as _ad
import sqlite3 as _sqlite3
import atexit as _atexit
from threading import Lock
from pytz import UnknownTimeZoneError
try:
import ujson as _json
@@ -85,7 +93,9 @@ def get_news_by_isin(isin, proxy=None, session=None):
return data.get('news', {})
def empty_df(index=[]):
def empty_df(index=None):
if index is None:
index = []
empty = _pd.DataFrame(index=index, data={
'Open': _np.nan, 'High': _np.nan, 'Low': _np.nan,
'Close': _np.nan, 'Adj Close': _np.nan, 'Volume': _np.nan})
@@ -247,11 +257,197 @@ def parse_actions(data):
splits.sort_index(inplace=True)
splits["Stock Splits"] = splits["numerator"] / \
splits["denominator"]
splits = splits["Stock Splits"]
splits = splits[["Stock Splits"]]
return dividends, splits
def set_df_tz(df, interval, tz):
    """Localize a price DataFrame's index to the exchange timezone `tz`.

    A timezone-naive index is assumed to hold UTC timestamps. For intervals
    of one day or longer the index is additionally floored to local midnight;
    ambiguous=True selects the 2nd occurrence when a DST roll-back makes the
    clock hit midnight twice.
    """
    idx = df.index
    if idx.tz is None:
        # Treat naive timestamps as UTC before converting
        idx = idx.tz_localize("UTC")
    idx = idx.tz_convert(tz)
    if interval in ["1d","1w","1wk","1mo","3mo"]:
        # Daily-or-longer bars should sit at local midnight
        idx = _pd.to_datetime(idx.date).tz_localize(tz, ambiguous=True)
    df.index = idx
    return df
def fix_Yahoo_returning_live_separate(quotes, interval, tz_exchange):
    """Merge Yahoo's separate 'live' row into the current week/month bar.

    Yahoo bug fix. If market is open today then Yahoo normally returns
    today's data as a separate row from rest-of week/month interval in above row.
    Seems to depend on what exchange e.g. crypto OK.
    Fix = merge them together.

    :param quotes: OHLC DataFrame fresh from Yahoo, oldest row first.
    :param interval: bar interval ("1wk"/"1mo"/"3mo" merged, "1d" de-duplicated).
    :param tz_exchange: exchange timezone used to compare the last two rows.
    :return: 'quotes' with the redundant final/penultimate row folded in.
    """
    n = quotes.shape[0]
    if n > 1:
        # Compare the last two rows in exchange-local time
        dt1 = quotes.index[n-1]
        dt2 = quotes.index[n-2]
        if quotes.index.tz is None:
            dt1 = dt1.tz_localize("UTC")
            dt2 = dt2.tz_localize("UTC")
        dt1 = dt1.tz_convert(tz_exchange)
        dt2 = dt2.tz_convert(tz_exchange)
        if interval in ["1wk", "1mo", "3mo"]:
            if interval == "1wk":
                last_rows_same_interval = dt1.year==dt2.year and dt1.week==dt2.week
            elif interval == "1mo":
                last_rows_same_interval = dt1.month==dt2.month
            elif interval == "3mo":
                last_rows_same_interval = dt1.year==dt2.year and dt1.quarter==dt2.quarter
            if last_rows_same_interval:
                # Last two rows are within same interval: fold the live row
                # (last) into the period row (second-to-last), then drop it
                idx1 = quotes.index[n-1]
                idx2 = quotes.index[n-2]
                if _np.isnan(quotes.loc[idx2,"Open"]):
                    quotes.loc[idx2,"Open"] = quotes["Open"][n-1]
                # Note: nanmax() & nanmin() ignores NaNs
                quotes.loc[idx2,"High"] = _np.nanmax([quotes["High"][n-1], quotes["High"][n-2]])
                quotes.loc[idx2,"Low"] = _np.nanmin([quotes["Low"][n-1], quotes["Low"][n-2]])
                quotes.loc[idx2,"Close"] = quotes["Close"][n-1]
                if "Adj High" in quotes.columns:
                    quotes.loc[idx2,"Adj High"] = _np.nanmax([quotes["Adj High"][n-1], quotes["Adj High"][n-2]])
                if "Adj Low" in quotes.columns:
                    quotes.loc[idx2,"Adj Low"] = _np.nanmin([quotes["Adj Low"][n-1], quotes["Adj Low"][n-2]])
                if "Adj Close" in quotes.columns:
                    quotes.loc[idx2,"Adj Close"] = quotes["Adj Close"][n-1]
                quotes.loc[idx2,"Volume"] += quotes["Volume"][n-1]
                quotes = quotes.drop(quotes.index[n-1])

        # Similar bug in daily data except most data is simply duplicated
        # - exception is volume, *slightly* greater on final row (and matches website)
        elif interval=="1d":
            if dt1.date() == dt2.date():
                # Last two rows are on same day. Drop second-to-last row
                quotes = quotes.drop(quotes.index[n-2])
    return quotes
def safe_merge_dfs(df_main, df_sub, interval):
    """Carefully merge event DataFrame 'df_sub' onto price DataFrame 'df_main'.

    If naive merge fails, try again with reindexing df_sub:
    1) if interval is weekly or monthly, then try with index set to start of week/month
    2) if still failing then manually search through df_main.index to reindex df_sub

    :param df_main: price DataFrame whose index defines the target intervals.
    :param df_sub: single-column event DataFrame (Dividends or Stock Splits).
    :param interval: price bar interval, e.g. "1d", "1wk", "90m".
    :return: joined DataFrame; for intra-day intervals unmatched events may be
        appended as extra rows with NaN prices.
    :raises Exception: if df_sub is empty, has more than one new column, or
        events are lost on a daily-or-longer interval.
    """
    if df_sub.shape[0] == 0:
        raise Exception("No data to merge")

    # Keep a pristine copy for the manual-reindex fallback
    df_sub_backup = df_sub.copy()
    data_cols = [c for c in df_sub.columns if c not in df_main]
    if len(data_cols) > 1:
        raise Exception("Expected 1 data col")
    data_col = data_cols[0]

    def _reindex_events(df, new_index, data_col_name):
        # Re-point event rows at interval-start dates, aggregating any
        # events that collapse onto the same interval.
        if len(new_index) == len(set(new_index)):
            # No duplicates, easy
            df.index = new_index
            return df

        df["_NewIndex"] = new_index
        # Duplicates present within periods but can aggregate
        if data_col_name == "Dividends":
            # Add
            df = df.groupby("_NewIndex").sum()
            df.index.name = None
        elif data_col_name == "Stock Splits":
            # Product
            df = df.groupby("_NewIndex").prod()
            df.index.name = None
        else:
            raise Exception("New index contains duplicates but unsure how to aggregate for '{}'".format(data_col_name))
        if "_NewIndex" in df.columns:
            df = df.drop("_NewIndex",axis=1)
        return df

    # Attempt 1: naive left-join on the raw event dates
    df = df_main.join(df_sub)
    f_na = df[data_col].isna()
    data_lost = sum(~f_na) < df_sub.shape[0]
    if not data_lost:
        return df
    # Lost data during join()

    if interval in ["1wk","1mo","3mo"]:
        # Attempt 2: backdate all df_sub.index dates to start of week/month
        if interval == "1wk":
            new_index = _pd.PeriodIndex(df_sub.index, freq='W').to_timestamp()
        elif interval == "1mo":
            new_index = _pd.PeriodIndex(df_sub.index, freq='M').to_timestamp()
        elif interval == "3mo":
            new_index = _pd.PeriodIndex(df_sub.index, freq='Q').to_timestamp()
        # ambiguous=True: pick 2nd midnight if DST roll-back duplicates it
        new_index = new_index.tz_localize(df.index.tz, ambiguous=True)
        df_sub = _reindex_events(df_sub, new_index, data_col)
        df = df_main.join(df_sub)

        f_na = df[data_col].isna()
        data_lost = sum(~f_na) < df_sub.shape[0]
        if not data_lost:
            return df

    # Attempt 3: lost data during join(). Manually check each df_sub.index date
    # against df_main.index to find matching interval
    df_sub = df_sub_backup.copy()
    new_index = [-1]*df_sub.shape[0]
    for i in range(df_sub.shape[0]):
        dt_sub_i = df_sub.index[i]
        if dt_sub_i in df_main.index:
            new_index[i] = dt_sub_i ; continue
        # Found a bad index date, need to search for near-match in df_main (same week/month)
        fixed = False
        for j in range(df_main.shape[0]-1):
            dt_main_j0 = df_main.index[j]
            dt_main_j1 = df_main.index[j+1]
            if (dt_main_j0 <= dt_sub_i) and (dt_sub_i < dt_main_j1):
                fixed = True
                if interval.endswith('h') or interval.endswith('m'):
                    # Must also be same day
                    fixed = (dt_main_j0.date() == dt_sub_i.date()) and (dt_sub_i.date() == dt_main_j1.date())
                if fixed:
                    dt_sub_i = dt_main_j0 ; break
        if not fixed:
            # Event is after the last price row; see if it belongs to the final interval
            last_main_dt = df_main.index[df_main.shape[0]-1]
            diff = dt_sub_i - last_main_dt
            if interval == "1mo" and last_main_dt.month == dt_sub_i.month:
                dt_sub_i = last_main_dt ; fixed = True
            elif interval == "3mo" and last_main_dt.year == dt_sub_i.year and last_main_dt.quarter == dt_sub_i.quarter:
                dt_sub_i = last_main_dt ; fixed = True
            elif interval == "1wk":
                if last_main_dt.week == dt_sub_i.week:
                    dt_sub_i = last_main_dt ; fixed = True
                elif (dt_sub_i>=last_main_dt) and (dt_sub_i-last_main_dt < _datetime.timedelta(weeks=1)):
                    # With some specific start dates (e.g. around early Jan), Yahoo
                    # messes up start-of-week, is Saturday not Monday. So check
                    # if same week another way
                    dt_sub_i = last_main_dt ; fixed = True
            elif interval == "1d" and last_main_dt.day == dt_sub_i.day:
                dt_sub_i = last_main_dt ; fixed = True
            elif interval == "1h" and last_main_dt.hour == dt_sub_i.hour:
                dt_sub_i = last_main_dt ; fixed = True
            elif interval.endswith('m') or interval.endswith('h'):
                td = _pd.to_timedelta(interval)
                if (dt_sub_i>=last_main_dt) and (dt_sub_i-last_main_dt < td):
                    dt_sub_i = last_main_dt ; fixed = True
        new_index[i] = dt_sub_i

    df_sub = _reindex_events(df_sub, new_index, data_col)
    df = df_main.join(df_sub)

    f_na = df[data_col].isna()
    data_lost = sum(~f_na) < df_sub.shape[0]
    if data_lost:
        ## Not always possible to match events with trading, e.g. when released pre-market.
        ## So have to append to bottom with nan prices.
        ## But should only be impossible with intra-day price data.
        if interval.endswith('m') or interval.endswith('h'):
            f_missing = ~df_sub.index.isin(df.index)
            df_sub_missing = df_sub[f_missing]
            keys = set(["Adj Open", "Open", "Adj High", "High", "Adj Low", "Low", "Adj Close", "Close"]).intersection(df.columns)
            df_sub_missing[list(keys)] = _np.nan
            df = _pd.concat([df, df_sub_missing], sort=True)
        else:
            raise Exception("Lost data during merge despite all attempts to align data (see above)")
    return df
def fix_Yahoo_dst_issue(df, interval):
if interval in ["1d","1w","1wk"]:
# These intervals should start at time 00:00. But for some combinations of date and timezone,
@@ -265,6 +461,14 @@ def fix_Yahoo_dst_issue(df, interval):
return df
def is_valid_timezone(tz: str) -> bool:
    """Return True if `tz` is a timezone name recognised by pytz."""
    try:
        _tz.timezone(tz)
        return True
    except UnknownTimeZoneError:
        return False
class ProgressBar:
def __init__(self, iterations, text='completed'):
self.text = text
@@ -315,44 +519,157 @@ class ProgressBar:
return str(self.prog_bar)
# Simple file cache of ticker->timezone:
_cache_dp = None
def get_cache_dirpath():
    """Return the "py-yfinance" cache folder path.

    Rooted at the user-set `_cache_dp` override if present, otherwise at
    the platform's default user cache dir (appdirs).
    """
    if _cache_dp is None:
        dp = _os.path.join(_ad.user_cache_dir(), "py-yfinance")
    else:
        dp = _os.path.join(_cache_dp, "py-yfinance")
    return dp
def set_tz_cache_location(dp):
    """Override the root folder used by get_cache_dirpath().

    :param dp: directory path under which the "py-yfinance" folder is created.
    """
    global _cache_dp
    _cache_dp = dp
# ---------------------------------
# TimeZone cache related code
# ---------------------------------
def cache_lookup_tkr_tz(tkr):
fp = _os.path.join(get_cache_dirpath(), "tkr-tz.csv")
if not _os.path.isfile(fp):
class _KVStore:
    """Simple Sqlite backed key/value store; keys and values are strings.

    Thread safety: every statement is executed while holding the instance
    mutex, because the connection is created with check_same_thread=False
    and may be shared across threads.
    Fixes vs previous version: get() now also takes the mutex (reads were
    unsynchronised despite the thread-safety claim), and set() is annotated
    -> None (it never returned a value).
    """

    def __init__(self, filename):
        self._cache_mutex = Lock()
        with self._cache_mutex:
            self.conn = _sqlite3.connect(filename, timeout=10, check_same_thread=False)
            # WAL lets readers proceed while a writer is active
            self.conn.execute('pragma journal_mode=wal')
            self.conn.execute('create table if not exists "kv" (key TEXT primary key, value TEXT) without rowid')
            self.conn.commit()
        # Close the connection cleanly at interpreter exit
        _atexit.register(self.close)

    def close(self):
        # Guard against double-close (atexit may fire after manual close)
        if self.conn is not None:
            with self._cache_mutex:
                self.conn.close()
                self.conn = None

    def get(self, key: str) -> Union[str, None]:
        """Get value for key if it exists else returns None"""
        with self._cache_mutex:
            item = self.conn.execute('select value from "kv" where key=?', (key,))
            return next(item, (None,))[0]

    def set(self, key: str, value: str) -> None:
        """Insert or replace the value stored under key."""
        with self._cache_mutex:
            self.conn.execute('replace into "kv" (key, value) values (?,?)', (key, value))
            self.conn.commit()

    def bulk_set(self, kvdata: Dict[str, str]):
        """Insert or replace many key/value pairs in one transaction."""
        records = tuple(kvdata.items())
        with self._cache_mutex:
            self.conn.executemany('replace into "kv" (key, value) values (?,?)', records)
            self.conn.commit()

    def delete(self, key: str):
        """Remove key if present; silently does nothing otherwise."""
        with self._cache_mutex:
            self.conn.execute('delete from "kv" where key=?', (key,))
            self.conn.commit()
class _TzCacheException(Exception):
    """Raised when the timezone-cache folder cannot be created or accessed."""
    pass
class _TzCache:
    """Simple sqlite file cache of ticker->timezone"""

    def __init__(self):
        # Underlying _KVStore, created lazily by the tz_db property
        self._tz_db = None
        self._setup_cache_folder()

    def _setup_cache_folder(self):
        # Create the cache folder, or verify an existing one is usable.
        # Raises _TzCacheException so callers can fall back to a dummy cache.
        if not _os.path.isdir(self._db_dir):
            try:
                _os.makedirs(self._db_dir)
            except OSError as err:
                raise _TzCacheException("Error creating TzCache folder: '{}' reason: {}"
                                        .format(self._db_dir, err))
        elif not (_os.access(self._db_dir, _os.R_OK) and _os.access(self._db_dir, _os.W_OK)):
            raise _TzCacheException("Cannot read and write in TzCache folder: '{}'"
                                    .format(self._db_dir, ))

    def lookup(self, tkr):
        """Return cached timezone for ticker 'tkr', or None if not cached."""
        return self.tz_db.get(tkr)

    def store(self, tkr, tz):
        """Cache timezone 'tz' for ticker 'tkr'; tz=None deletes the entry.

        Raises if a different value is already cached — overwriting is
        deliberately not allowed, callers must delete (store None) first.
        """
        if tz is None:
            self.tz_db.delete(tkr)
        elif self.tz_db.get(tkr) is not None:
            raise Exception("Tkr {} tz already in cache".format(tkr))
        else:
            self.tz_db.set(tkr, tz)

    @property
    def _db_dir(self):
        # Derived from module-level _cache_dir so set_tz_cache_location()
        # takes effect for caches created afterwards
        global _cache_dir
        return _os.path.join(_cache_dir, "py-yfinance")

    @property
    def tz_db(self):
        # lazy init
        if self._tz_db is None:
            self._tz_db = _KVStore(_os.path.join(self._db_dir, "tkr-tz.db"))
            self._migrate_cache_tkr_tz()
        return self._tz_db

    def _migrate_cache_tkr_tz(self):
        """Migrate contents from old ticker CSV-cache to SQLite db"""
        fp = _os.path.join(self._db_dir, "tkr-tz.csv")
        if not _os.path.isfile(fp):
            return None
        df = _pd.read_csv(fp, index_col="Ticker")
        self.tz_db.bulk_set(df.to_dict()['Tz'])
        # CSV removed after migration so this only ever runs once
        _os.remove(fp)
class _TzCacheDummy:
    """Dummy cache to use if tz cache is disabled.

    No-op stand-in for _TzCache with the same interface, used when the real
    cache folder cannot be created (e.g. read-only filesystem).
    Fix: this block had lines of the removed CSV-cache functions interleaved
    inside the class body, making it syntactically broken; the dead remnants
    are deleted.
    """

    def lookup(self, tkr):
        # Nothing is ever cached, so every lookup is a miss
        return None

    def store(self, tkr, tz):
        # Silently discard - caching is disabled
        pass

    @property
    def tz_db(self):
        # No backing database
        return None
def get_tz_cache():
    """
    Get the timezone cache, initializes it and creates cache folder if needed on first call.
    If folder cannot be created for some reason it will fall back to initialize a
    dummy cache with same interface as real cache.

    Fix: this block had lines of the removed CSV-cache implementation
    interleaved with the new code, making it incoherent; the dead remnants
    are deleted (and the "real cash" typo corrected).
    """
    # as this can be called from multiple threads, protect it.
    with _cache_init_lock:
        global _tz_cache
        if _tz_cache is None:
            try:
                _tz_cache = _TzCache()
            except _TzCacheException as err:
                print("Failed to create TzCache, reason: {}".format(err))
                print("TzCache will not be used.")
                print("Tip: You can direct cache to use a different location with 'set_tz_cache_location(mylocation)'")
                _tz_cache = _TzCacheDummy()

        return _tz_cache
# Module-level state for the ticker->timezone cache.
_cache_dir = _ad.user_cache_dir()  # cache root; override via set_tz_cache_location()
_cache_init_lock = Lock()  # guards lazy creation of _tz_cache in get_tz_cache()
_tz_cache = None  # singleton (_TzCache or _TzCacheDummy), created on first use
def set_tz_cache_location(cache_dir: str):
    """
    Sets the path to create the "py-yfinance" cache folder in.
    Useful if the default folder returned by "appdir.user_cache_dir()" is not writable.
    Must be called before cache is used (that is, before fetching tickers).
    :param cache_dir: Path to use for caches
    :return: None
    """
    global _cache_dir, _tz_cache
    # Changing the path after the singleton exists would silently point at a
    # different database, so refuse once the cache has been created.
    assert _tz_cache is None, "Time Zone cache already initialized, setting path must be done before cache is created"
    _cache_dir = cache_dir

View File

@@ -1 +1 @@
version = "0.1.81"
version = "0.2.0rc1"