Source code for bokeh.util.sampledata
#-----------------------------------------------------------------------------
# Copyright (c) 2012 - 2023, Anaconda, Inc., and Bokeh Contributors.
# All rights reserved.
#
# The full license is in the file LICENSE.txt, distributed with this software.
#-----------------------------------------------------------------------------
''' Helper functions for downloading and accessing sample data.
'''
#-----------------------------------------------------------------------------
# Boilerplate
#-----------------------------------------------------------------------------
from __future__ import annotations # isort:skip
# NOTE: skip logging imports so that this module may be run as a script
#-----------------------------------------------------------------------------
# Imports
#-----------------------------------------------------------------------------
# Standard library imports
import hashlib
import json
from os.path import splitext
from pathlib import Path
from sys import stdout
from typing import TYPE_CHECKING, Any, TextIO
from urllib.parse import urljoin
from urllib.request import urlopen
# NOTE: since downloading sampledata is not a common occurrence, non-stdlib
# imports are generally deferrered in this module
if TYPE_CHECKING:
import pandas as pd
#-----------------------------------------------------------------------------
# Globals and constants
#-----------------------------------------------------------------------------
__all__ = (
'download',
)
#-----------------------------------------------------------------------------
# General API
#-----------------------------------------------------------------------------
[docs]def download(progress: bool = True) -> None:
''' Download larger data sets for various Bokeh examples.
'''
data_dir = external_data_dir(create=True)
print(f"Using data directory: {data_dir}")
# HTTP requests are cheaper for us, and there is nothing private to protect
s3 = 'http://sampledata.bokeh.org'
files = json.load(open(Path(__file__).parent / "sampledata.json"))
for filename, md5 in files:
real_name, ext = splitext(filename)
if ext == '.zip':
if not splitext(real_name)[1]:
real_name += ".csv"
else:
real_name += ext
real_path = data_dir /real_name
if real_path.exists():
local_md5 = hashlib.md5(open(real_path, 'rb').read()).hexdigest()
if local_md5 == md5:
print(f"Skipping {filename!r} (checksum match)")
continue
else:
print(f"Re-fetching {filename!r} (checksum mismatch)")
_download_file(s3, filename, data_dir, progress=progress)
#-----------------------------------------------------------------------------
# Dev API
#-----------------------------------------------------------------------------
def external_csv(module: str, name: str, **kw: Any) -> pd.DataFrame:
'''
'''
import pandas as pd
return pd.read_csv(external_path(name), **kw)
def external_data_dir(create: bool = False) -> Path:
'''
'''
try:
import yaml
except ImportError:
raise RuntimeError("'yaml' and 'pyyaml' are required to use bokeh.sampledata functions")
bokeh_dir = _bokeh_dir(create=create)
data_dir = bokeh_dir / "data"
try:
config = yaml.safe_load(open(bokeh_dir / 'config'))
data_dir = Path.expanduser(config['sampledata_dir'])
except (OSError, TypeError):
pass
if not data_dir.exists():
if not create:
raise RuntimeError('bokeh sample data directory does not exist, please execute bokeh.sampledata.download()')
print(f"Creating {data_dir} directory")
try:
data_dir.mkdir()
except OSError:
raise RuntimeError(f"could not create bokeh data directory at {data_dir}")
else:
if not data_dir.is_dir():
raise RuntimeError(f"{data_dir} exists but is not a directory")
return data_dir
def external_path(filename: str | Path) -> Path:
data_dir = external_data_dir()
fn = data_dir / filename
if not fn.exists() or not fn.is_file():
raise RuntimeError(f"Could not locate external data file {fn}. Please execute bokeh.sampledata.download()")
return fn
def package_csv(module: str, name: str, **kw: Any) -> pd.DataFrame:
'''
'''
import pandas as pd
return pd.read_csv(package_path(name), **kw)
def package_dir() -> Path:
'''
'''
return Path(__file__).parents[1].joinpath("sampledata", "_data").resolve()
def package_path(filename: str | Path) -> Path:
'''
'''
return package_dir() / filename
def open_csv(filename: str | Path) -> TextIO:
'''
'''
return open(filename, newline='', encoding='utf8')
#-----------------------------------------------------------------------------
# Private API
#-----------------------------------------------------------------------------
def _bokeh_dir(create: bool = False) -> Path:
bokeh_dir = Path("~").expanduser() / ".bokeh"
if not bokeh_dir.exists():
if not create: return bokeh_dir
print(f"Creating {bokeh_dir} directory")
try:
bokeh_dir.mkdir()
except OSError:
raise RuntimeError(f"could not create bokeh config directory at {bokeh_dir}")
else:
if not bokeh_dir.is_dir():
raise RuntimeError(f"{bokeh_dir} exists but is not a directory")
return bokeh_dir
def _download_file(base_url: str, filename: str, data_dir: Path, progress: bool = True) -> None:
'''
'''
# These are actually somewhat expensive imports that added ~5% to overall
# typical bokeh import times. Since downloading sampledata is not a common
# action, we defer them to inside this function.
from zipfile import ZipFile
file_url = urljoin(base_url, filename)
file_path = data_dir / filename
url = urlopen(file_url)
with open(file_path, 'wb') as file:
file_size = int(url.headers["Content-Length"])
print(f"Downloading: {filename} ({file_size} bytes)")
fetch_size = 0
block_size = 16384
while True:
data = url.read(block_size)
if not data:
break
fetch_size += len(data)
file.write(data)
if progress:
status = f"\r{fetch_size:< 10d} [{fetch_size*100.0/file_size:6.2f}%%]"
stdout.write(status)
stdout.flush()
if progress:
print()
real_name, ext = splitext(filename)
if ext == '.zip':
if not splitext(real_name)[1]:
real_name += ".csv"
print(f"Unpacking: {real_name}")
with ZipFile(file_path, 'r') as zip_file:
zip_file.extract(real_name, data_dir)
file_path.unlink()
#-----------------------------------------------------------------------------
# Code
#-----------------------------------------------------------------------------
# This is necessary so that we can run the sampledata download code in the
# release build, before an actual package exists.
if __name__ == "__main__":
download(progress=False)