#-----------------------------------------------------------------------------
# Copyright (c) 2012 - 2023, Anaconda, Inc., and Bokeh Contributors.
# All rights reserved.
#
# The full license is in the file LICENSE.txt, distributed with this software.
#-----------------------------------------------------------------------------
''' Helper functions for downloading and accessing sample data.'''

#-----------------------------------------------------------------------------
# Boilerplate
#-----------------------------------------------------------------------------
from __future__ import annotations # isort:skip

# NOTE: skip logging imports so that this module may be run as a script

#-----------------------------------------------------------------------------
# Imports
#-----------------------------------------------------------------------------

# Standard library imports
import hashlib
import json
from os.path import splitext
from pathlib import Path
from sys import stdout
from typing import TYPE_CHECKING, Any, TextIO
from urllib.parse import urljoin
from urllib.request import urlopen

# NOTE: since downloading sampledata is not a common occurrence, non-stdlib
# imports are generally deferred in this module
if TYPE_CHECKING:
    import pandas as pd

#-----------------------------------------------------------------------------
# Globals and constants
#-----------------------------------------------------------------------------

# Only the download entry point is part of the public API of this module.
__all__ = (
    'download',
)

#-----------------------------------------------------------------------------
# General API
#-----------------------------------------------------------------------------

def download(progress: bool = True) -> None:
    ''' Download larger data sets for various Bokeh examples.

    Every entry listed by ``metadata()`` is fetched into the external data
    directory (which is created if necessary). An entry is skipped when a
    local file with the expected name already exists and its MD5 digest
    matches the recorded one.

    Args:
        progress (bool, optional) :
            Whether to report download progress on stdout (default: True)

    Returns:
        None

    '''
    target_dir = external_data_dir(create=True)
    print(f"Using data directory: {target_dir}")

    # HTTP requests are cheaper for us, and there is nothing private to protect
    base_url = 'http://sampledata.bokeh.org'

    for remote_name, expected_md5 in metadata().items():
        local_file = target_dir / real_name(remote_name)

        # Only hash the local copy when there is one to hash
        up_to_date = (
            local_file.exists()
            and hashlib.md5(local_file.read_bytes()).hexdigest() == expected_md5
        )
        if up_to_date:
            print(f"Skipping {remote_name!r} (checksum match)")
            continue

        print(f"Fetching {remote_name!r}")
        _download_file(base_url, remote_name, target_dir, progress=progress)

#-----------------------------------------------------------------------------
# Dev API
#-----------------------------------------------------------------------------

def real_name(name: str) -> str:
    ''' Return the name a sample data file has on disk once it is unpacked.

    Names that do not end in ``.zip`` are returned unchanged. For ``.zip``
    names the archive suffix is dropped; if what remains has no extension of
    its own, ``.csv`` is appended (``"foo.zip"`` -> ``"foo.csv"``,
    ``"foo.json.zip"`` -> ``"foo.json"``).

    '''
    stem, ext = splitext(name)
    if ext != ".zip":
        return name
    return stem if splitext(stem)[1] else f"{stem}.csv"


def metadata() -> dict[str, str]:
    ''' Load the ``sampledata.json`` file located next to this module.

    The callers in this module treat the keys as remote file names and the
    values as the expected MD5 hex digests.

    '''
    with (Path(__file__).parent / "sampledata.json").open("rb") as f:
        return dict(json.load(f))


def external_csv(module: str, name: str, **kw: Any) -> pd.DataFrame:
    ''' Read a downloaded (external) CSV file into a pandas DataFrame.

    ``module`` is accepted for interface compatibility but is not used.
    Extra keyword arguments are forwarded to ``pandas.read_csv``.

    '''
    import pandas as pd
    return pd.read_csv(external_path(name), **kw)


def external_data_dir(*, create: bool = False) -> Path:
    ''' Locate (and optionally create) the directory for downloaded data.

    The default location is ``~/.bokeh/data``. It can be overridden with a
    ``sampledata_dir`` entry in the YAML file ``~/.bokeh/config``.

    Args:
        create (bool, optional) :
            Whether to create missing directories (default: False)

    Returns:
        Path

    Raises:
        RuntimeError
            If PyYAML cannot be imported, if the directory is missing and
            ``create`` is false, if it cannot be created, or if the path
            exists but is not a directory.

    '''
    try:
        import yaml
    except ImportError as e:
        raise RuntimeError("'yaml' and 'pyyaml' are required to use bokeh.sampledata functions") from e

    bokeh_dir = _bokeh_dir(create=create)
    data_dir = bokeh_dir / "data"

    try:
        # Use a context manager so the config file handle is always closed
        with open(bokeh_dir / 'config') as config_file:
            config = yaml.safe_load(config_file)
        # The configured value is a plain string: build a Path from it before
        # expanding "~" (Path.expanduser cannot be called on a str directly)
        data_dir = Path(config['sampledata_dir']).expanduser()
    except (OSError, TypeError, KeyError):
        # No readable config, an empty/non-mapping config, or a config that
        # does not set (a usable) sampledata_dir: keep the default location
        pass

    if not data_dir.exists():
        if not create:
            raise RuntimeError('bokeh sample data directory does not exist, please execute bokeh.sampledata.download()')
        print(f"Creating {data_dir} directory")
        try:
            data_dir.mkdir()
        except OSError as e:
            raise RuntimeError(f"could not create bokeh data directory at {data_dir}") from e
    elif not data_dir.is_dir():
        raise RuntimeError(f"{data_dir} exists but is not a directory")

    return data_dir


def external_path(file_name: str) -> Path:
    ''' Return the path of a downloaded data file after verifying it.

    The file must exist in the external data directory, must be listed in
    ``metadata()`` (under its own name, ``<name>.zip`` or ``<stem>.zip``) and
    its MD5 digest must match the recorded one.

    Raises:
        RuntimeError
            If the file is missing, unknown, or its checksum does not match.

    '''
    data_dir = external_data_dir()
    file_path = data_dir / file_name
    if not file_path.exists() or not file_path.is_file():
        raise RuntimeError(f"Could not locate external data file {file_path}. Please execute bokeh.sampledata.download()")

    with open(file_path, "rb") as file:
        meta = metadata()
        # The metadata may list the file itself or the archive it came from
        known_md5 = (
            meta.get(file_name)
            or meta.get(f"{file_name}.zip")
            or meta.get(f"{splitext(file_name)[0]}.zip")
        )
        if known_md5 is None:
            raise RuntimeError(f"Unknown external data file {file_name}")

        local_md5 = hashlib.md5(file.read()).hexdigest()
        if known_md5 != local_md5:
            raise RuntimeError(f"External data file {file_path} is outdated. Please execute bokeh.sampledata.download()")

    return file_path


def package_csv(module: str, name: str, **kw: Any) -> pd.DataFrame:
    ''' Read a CSV file from the packaged data directory into a DataFrame.

    ``module`` is accepted for interface compatibility but is not used.
    Extra keyword arguments are forwarded to ``pandas.read_csv``.

    '''
    import pandas as pd
    return pd.read_csv(package_path(name), **kw)


def package_dir() -> Path:
    ''' Return the resolved ``sampledata/_data`` directory, located relative
    to the parent of this module's directory.

    '''
    return Path(__file__).parents[1].joinpath("sampledata", "_data").resolve()


def package_path(filename: str | Path) -> Path:
    ''' Return the path of ``filename`` inside the packaged data directory. '''
    return package_dir() / filename


def load_json(filename: str | Path) -> Any:
    ''' Parse the JSON file at ``filename`` and return the decoded value. '''
    with open(filename, "rb") as f:
        return json.load(f)


def open_csv(filename: str | Path) -> TextIO:
    ''' Open ``filename`` as UTF-8 text with newline translation disabled,
    as expected by the ``csv`` module. The caller must close the file.

    '''
    return open(filename, newline='', encoding='utf8')

#-----------------------------------------------------------------------------
# Private API
#-----------------------------------------------------------------------------

def _bokeh_dir(create: bool = False) -> Path:
    ''' Return the ``~/.bokeh`` directory, creating it when ``create`` is true.

    When the directory does not exist and ``create`` is false, the path is
    returned anyway without touching the file system.

    Raises:
        RuntimeError
            If the directory cannot be created, or the path exists but is
            not a directory.

    '''
    bokeh_dir = Path("~").expanduser() / ".bokeh"
    if not bokeh_dir.exists():
        if not create:
            return bokeh_dir
        print(f"Creating {bokeh_dir} directory")
        try:
            bokeh_dir.mkdir()
        except OSError as e:
            raise RuntimeError(f"could not create bokeh config directory at {bokeh_dir}") from e
    elif not bokeh_dir.is_dir():
        raise RuntimeError(f"{bokeh_dir} exists but is not a directory")
    return bokeh_dir


def _progress_status(fetch_size: int, file_size: int | None) -> str:
    ''' Format the carriage-return-prefixed progress line for a download. '''
    if not file_size:
        # Total size unknown (or reported as zero): show the byte count only
        return f"\r{fetch_size:< 10d}"
    # A single "%" is correct here: "%%" is only an escape in %-formatting,
    # inside an f-string it would be printed as two characters
    return f"\r{fetch_size:< 10d} [{fetch_size*100.0/file_size:6.2f}%]"


def _download_file(base_url: str, filename: str, data_dir: Path, progress: bool = True) -> None:
    ''' Download ``filename`` from ``base_url`` into ``data_dir``.

    A ``.zip`` download is unpacked: the member named ``real_name(filename)``
    is extracted into ``data_dir`` and the archive itself is deleted.

    Args:
        base_url (str) : URL that ``filename`` is joined onto
        filename (str) : remote (and local) file name
        data_dir (Path) : destination directory
        progress (bool, optional) :
            Whether to write a progress indicator to stdout (default: True)

    '''
    # These are actually somewhat expensive imports that added ~5% to overall
    # typical bokeh import times. Since downloading sampledata is not a common
    # action, we defer them to inside this function.
    from zipfile import ZipFile

    file_url = urljoin(base_url, filename)
    file_path = data_dir / filename

    # The response is a context manager; closing it releases the connection
    with urlopen(file_url) as url:
        # Inspect the headers before creating the destination file, so that a
        # bad response does not leave an empty file behind
        content_length = url.headers.get("Content-Length")
        file_size = int(content_length) if content_length is not None else None
        size_text = f"{file_size} bytes" if file_size is not None else "unknown size"
        print(f"Downloading: {filename} ({size_text})")

        fetch_size = 0
        block_size = 16384

        with open(file_path, 'wb') as file:
            while True:
                data = url.read(block_size)
                if not data:
                    break

                fetch_size += len(data)
                file.write(data)

                if progress:
                    stdout.write(_progress_status(fetch_size, file_size))
                    stdout.flush()

    if progress:
        # Terminate the in-place progress line
        print()

    if splitext(filename)[1] == '.zip':
        member = real_name(filename)
        print(f"Unpacking: {member}")

        with ZipFile(file_path, 'r') as zip_file:
            zip_file.extract(member, data_dir)

        # Only the extracted member is kept
        file_path.unlink()

#-----------------------------------------------------------------------------
# Code
#-----------------------------------------------------------------------------

# This is necessary so that we can run the sampledata download code in the
# release build, before an actual package exists.
if __name__ == "__main__":
    download(progress=False)