Source code for bokeh.charts.data_source

''' The classes and functionality used to transform data inputs to consistent
types.

'''
from __future__ import absolute_import

from copy import copy
from itertools import chain
from operator import itemgetter

import numpy as np
import pandas as pd
from six import iteritems
from six.moves import zip

from bokeh.core.has_props import HasProps
from bokeh.core.properties import bokeh_integer_types, Datetime, Float, List, String
from bokeh.models.sources import ColumnDataSource

from .properties import Column, ColumnLabel
from .stats import Bins, Stat
from .utils import collect_attribute_columns, gen_column_names, special_columns

COMPUTED_COLUMN_NAMES = ['_charts_ones']
ARRAY_TYPES = [tuple, list, np.ndarray, pd.Series]
TABLE_TYPES = [dict, pd.DataFrame]
DEFAULT_DIMS = ['x', 'y']
DEFAULT_REQ_DIMS = [['x'], ['y'], ['x', 'y']]


class ColumnAssigner(HasProps):
    """Defines behavior for assigning columns to dimensions.

    This class is used to collect assignments between columns and :class:`Builder`
    dimensions when none are provided. The :class:`ChartDataSource` receives a
    ColumnAssigner from each :class:`Builder`, which can implement custom behavior.

    Each subclass must implement the :meth:`get_assignment` method, which returns
    a `dict` mapping between each dimension in `dims` and one or more column names,
    or `None` if no assignment is made for the associated dimension.

    """
    dims = List(String, help="""
        The list of dimension names that are associated with the :class:`Builder`. The
        ColumnAssigner should return a dict with each dimension as a key when the
        :meth:`get_assignment` method is called.
        """)
    attrs = List(String, help="""
        The list of attribute names that are associated with the :class:`Builder`. These
        can be used to alter which dimensions are assigned which columns, versus which
        attributes are assigned which columns.
        """)

    def __init__(self, df=None, **properties):
        """Create the assigner.

        Args:
            df (:class:`pandas.DataFrame`, optional): the data source from which
                columns are assigned
            **properties: any attribute of the ColumnAssigner

        """
        if df is not None:
            self._df = df
        super(ColumnAssigner, self).__init__(**properties)

    def get_assignment(self, selections=None):
        raise NotImplementedError('You must return a map between each dim and selection.')


class OrderedAssigner(ColumnAssigner):
    """Assigns one column for each dimension that is not an attribute, in order.

    This is the default column assigner for the :class:`Builder`.

    """

    def get_assignment(self, selections=None):
        """Get a mapping between dimension and selection when none are provided."""
        if selections is None or len(list(selections.keys())) == 0:
            dims = [dim for dim in self.dims if dim not in self.attrs]
            return {dim: sel for dim, sel in
                    zip(dims, self._df.columns.tolist())}
        else:
            return selections

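# Example (illustrative, not part of the original module): OrderedAssigner zips
# the non-attribute dimensions with the DataFrame columns, in order:
#
#     df = pd.DataFrame({'height': [1.2, 3.4], 'weight': [5.0, 6.1]})
#     assigner = OrderedAssigner(df=df, dims=['x', 'y'], attrs=[])
#     assigner.get_assignment()  # -> {'x': 'height', 'y': 'weight'}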

class NumericalColumnsAssigner(ColumnAssigner):
    """Assigns all numerical columns to the y dimension."""

    def get_assignment(self, selections=None):
        if isinstance(selections, dict):
            x = selections.get('x')
            y = selections.get('y')
        else:
            x = None
            y = None
            selections = {}

        # filter down to only the numerical columns
        df = self._df._get_numeric_data()
        num_cols = df.columns.tolist()

        if x is not None and y is None:
            x_cols = x if isinstance(x, list) else [x]
            y = [col for col in num_cols if col not in x_cols]
        elif x is None:
            x = 'index'

            if y is None:
                y = num_cols

        selections['x'] = x
        selections['y'] = y
        return selections
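
# Example (illustrative, not part of the original module): with no selections,
# all numerical columns are assigned to y, and the index is used for x:
#
#     df = pd.DataFrame({'a': [1, 2], 'b': [3.0, 4.0], 'c': ['m', 'n']})
#     assigner = NumericalColumnsAssigner(df=df, dims=['x', 'y'], attrs=[])
#     assigner.get_assignment()  # -> {'x': 'index', 'y': ['a', 'b']}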


class DataOperator(HasProps):
    """An operation that transforms data before it is used for plotting."""
    columns = List(ColumnLabel(), default=None, help="""
        List of columns to perform operation on.""")

    def apply(self, data):
        raise NotImplementedError('Each data operator must implement the apply method.')

    def __repr__(self):
        col_str = ', '.join(self.columns or [])
        return '%s(%s)' % (self.__class__.__name__, col_str)

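# A minimal sketch (hypothetical, not part of this module) of a concrete
# DataOperator; built-in operators such as Blend follow this pattern, where
# `data` is the ChartDataSource and apply returns the transformed DataFrame:
#
#     class Scale(DataOperator):
#         name = String(default='scaled')  # apply_operations reads `name`
#         factor = Float(default=1.0)
#
#         def apply(self, data):
#             df = data.df.copy()
#             for col in self.columns:
#                 df[col] = df[col] * self.factor
#             return df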

class DataGroup(object):
    """Contains subset of data and metadata about it.

    The DataGroup contains a map from the labels of each attribute
    associated with an :class:`AttrSpec` to the value of the attribute
    assigned to the DataGroup.

    """

    def __init__(self, label, data, attr_specs):
        """Create a DataGroup for the data, with a label and associated attributes.

        Args:
            label (str): the label for the group, based on unique values of each column
            data (:class:`pandas.DataFrame`): the subset of data associated with the group
            attr_specs (dict(str, :class:`AttrSpec`)): mapping between attribute name
                and the associated :class:`AttrSpec`

        """
        self.label = label
        self.data = data
        self.attr_specs = attr_specs

    def get_values(self, selection):
        """Get the data associated with the selection of columns.

        Args:
            selection (List(Str) or Str): the column or columns selected

        Returns:
            :class:`pandas.DataFrame`

        """
        if isinstance(selection, str):
            return self.data[selection]
        elif isinstance(selection, list) and len(selection) == 1:
            return self.data[selection[0]]
        elif isinstance(selection, list) and len(selection) > 1:
            return self.data[selection]
        else:
            return None

    @property
    def source(self):
        """The :class:`ColumnDataSource` representation of the DataFrame."""
        return ColumnDataSource(self.data)

    def __getitem__(self, spec_name):
        """Get the value of the :class:`AttrSpec` associated with ``spec_name``."""
        return self.attr_specs[spec_name]

    def __repr__(self):
        return '<DataGroup(%s) - attributes: %s>' % (str(self.label), self.attr_specs)

    def __len__(self):
        return len(self.data.index)

    @property
    def attributes(self):
        return list(self.attr_specs.keys())

    def to_dict(self):
        row = {}
        if self.label is not None:
            row.update(self.label)
            row['chart_index'] = tuple(self.label.items())
        else:
            row['chart_index'] = None
        row.update(self.attr_specs)
        return row


def groupby(df, **specs):
    """Convenience iterator around pandas groupby and attribute specs.

    Args:
        df (:class:`~pandas.DataFrame`): the entire data source being used
            for the Chart
        **specs: name, :class:`AttrSpec` pairings, used to identify the lowest
            level at which the data is grouped

    Yields:
        :class:`DataGroup`: each unique group of data to be used to produce glyphs

    """
    spec_cols = collect_attribute_columns(**specs)

    # if there was any input for chart attributes, which require grouping
    if spec_cols:
        # df = df.sort(columns=spec_cols)
        for name, data in df.groupby(spec_cols, sort=False):

            attrs = {}
            group_label = {}

            for spec_name, spec in iteritems(specs):
                if spec.columns is not None:
                    # get the index of the unique column values grouped on for this spec
                    name_idx = tuple([spec_cols.index(col) for col in spec.columns])

                    if isinstance(name, tuple):
                        # this handles the case of utilizing one or more and overlapping
                        # column names for different attrs. The name (label) is a tuple
                        # of the grouped column values; we extract only the values
                        # associated with the columns that this attr spec was
                        # configured with.
                        label = itemgetter(*name_idx)(name)
                        cols = itemgetter(*name_idx)(spec_cols)
                    else:
                        label = name
                        cols = spec_cols[0]

                    if not isinstance(label, tuple):
                        label = (label, )

                    if not isinstance(cols, list) and not isinstance(cols, tuple):
                        cols = [cols]

                    for col, value in zip(cols, label):
                        group_label[col] = value
                else:
                    label = None

                # get the attribute value for this spec, given the unique column
                # values associated with it
                attrs[spec_name] = spec[label]

            yield DataGroup(label=group_label, data=data, attr_specs=attrs)

    # collect up the defaults from the attribute specs
    else:
        attrs = {}
        for spec_name, spec in iteritems(specs):
            attrs[spec_name] = spec[None]

        yield DataGroup(label=None, data=df, attr_specs=attrs)

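# Example (illustrative, not part of the original module): grouping a frame by
# an attribute spec; `ColorAttr` is assumed to be importable from
# bokeh.charts.attributes:
#
#     from bokeh.charts.attributes import ColorAttr
#
#     df = pd.DataFrame({'cyl': [4, 4, 6], 'mpg': [30, 28, 22]})
#     color = ColorAttr(df=df, columns=['cyl'])
#     for group in groupby(df, color=color):
#         print(group.label, len(group))  # e.g. {'cyl': 4} 2
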
class ChartDataSource(object):
    """Validates, normalizes, groups, and assigns Chart attributes to groups.

    Supported inputs are:

    - **Array-like**: list, tuple, :class:`numpy.ndarray`, :class:`pandas.Series`
    - **Table-like**:
        - records: list(dict)
        - columns: dict(list), :class:`pandas.DataFrame`, or blaze resource

    Converts inputs that can be treated as table-like data to a pandas
    DataFrame, which is used for assigning attributes to data groups.

    """

    def __init__(self, df, dims=None, required_dims=None, selections=None,
                 column_assigner=OrderedAssigner, attrs=None, **kwargs):
        """Create a :class:`ChartDataSource`.

        Args:
            df (:class:`pandas.DataFrame`): the original data source for the chart
            dims (List(Str), optional): list of valid dimensions for the chart
            required_dims (List(List(Str)), optional): list of lists of valid
                dimensional selections for the chart
            selections (Dict(String, List(Column)), optional): mapping between a
                dimension and the column name(s) associated with it. This represents
                what the user selected for the current chart.
            column_assigner (:class:`ColumnAssigner`, optional): a reference to a
                ColumnAssigner class, which is used to collect dimension column
                assignments when keyword arguments aren't provided. The default,
                :class:`OrderedAssigner`, assigns each column or array to each
                dimension of the chart in the order they are received.
            attrs (list(str)): list of attribute names the chart uses

        """
        if dims is None:
            dims = DEFAULT_DIMS

        if required_dims is None:
            required_dims = DEFAULT_REQ_DIMS

        self.input_type = kwargs.pop('input_type', None)
        self.attrs = attrs or []
        self._data = df.copy(deep=False)
        self._dims = dims
        self.operations = []
        self._required_dims = required_dims
        self.column_assigner = column_assigner(
            df=self._data, dims=list(self._dims), attrs=self.attrs,
        )
        self._selections = self.get_selections(selections, **kwargs)
        self.setup_derived_columns()
        self.apply_operations()
        self.meta = self.collect_metadata(df)
        self._validate_selections()

    @property
    def attr_specs(self):
        return {dim: val for dim, val in iteritems(self._selections)
                if dim in self.attrs}

    def get_selections(self, selections, **kwargs):
        """Maps chart dimensions to selections and checks input requirements.

        Returns:
            dict: mapping between each dimension and the selected columns. If no
            selection is made for a dimension, the dimension is associated with
            ``None``.

        """
        select_map = {}

        # extract selections from kwargs using the dimension list
        for dim in self._dims:
            dim_select = kwargs.pop(dim, None)
            if dim_select is not None:
                select_map[dim] = dim_select

        # handle the case where dimension kwargs were not provided
        if len(select_map.keys()) == 0:
            if selections is None:
                # if no selections are provided, we assume they were provided in order
                select_map = self.column_assigner.get_assignment()

            elif isinstance(selections, dict):
                if len(selections.keys()) != 0:
                    # selections were specified in the inputs
                    select_map = selections
            else:
                # the selections input type isn't valid
                raise ValueError('selections input must be provided as: '
                                 'dict(dimension: column) or None')
        else:
            # provide an opportunity for the column assigner to apply custom logic
            select_map = self.column_assigner.get_assignment(selections=select_map)

        # make sure each dimension is represented in the selection map
        for dim in self._dims:
            if dim not in select_map:
                select_map[dim] = None

        return select_map

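    # Example (illustrative, not part of the original module): selections can be
    # passed as dimension keyword arguments. With the default dims ['x', 'y']:
    #
    #     ds = ChartDataSource.from_data(pd.DataFrame({'mpg': [30, 22]}), x='mpg')
    #     ds['x']  # -> 'mpg'
    #     ds['y']  # -> None
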
    def apply_operations(self):
        """Applies each data operation."""
        # ToDo: Handle order of operation application, see GoG pg. 71
        selections = self._selections.copy()
        for dim, select in iteritems(self._selections):
            if isinstance(select, DataOperator):
                self._data = select.apply(self)
                selections[dim] = select.name

            # handle any stat operations to derive and aggregate data
            if isinstance(select, Stat):
                if isinstance(select, Bins):
                    self._data = select.apply(self)
                    selections[dim] = select.centers_column
                else:
                    raise TypeError('Stat input of %s for %s is not supported.' %
                                    (select.__class__, dim))

                self.operations.append(select)

        self._selections = selections

    def setup_derived_columns(self):
        """Attempt to add special case columns to the DataFrame for the builder."""
        for dim in self._dims:
            dim_selection = self[dim]
            if dim_selection is not None and isinstance(dim_selection, str) and \
                    dim_selection in special_columns and \
                    dim_selection not in self.df.columns.tolist():
                self._data[dim_selection] = special_columns[dim_selection](self._data)

    def __getitem__(self, dim):
        """Get the columns selected for the given dimension name (e.g. dim='x').

        Returns:
            str or list(str): the columns selected. If the dimension is not in
            ``_selections``, ``None`` is returned.

        """
        if dim in self._selections:
            return self._selections[dim]
        else:
            return None

    def __setitem__(self, dim, value):
        self._selections[dim] = value
        self.setup_derived_columns()

    def stack_measures(self, measures, ids=None, var_name='variable',
                       value_name='value'):
        """De-pivots ``_data`` from a 'wide' to a 'tall' layout.

        A wide table is one where the column names represent a categorical
        variable, and each column contains only the values associated with one
        unique value of the categorical variable.

        This method uses the :func:`pandas.melt` function with additional logic
        to make sure that the same data source can have multiple operations
        applied, and so all other columns are maintained through the stacking
        process.

        Example:

            .. note::
                This example is fairly low level and is not something the
                typical user should worry about. The interface for data
                transformations from the user's perspective is the
                :ref:`bokeh_charts_functions`.

            >>> data = {'a': [1, 2, 3, 4],
            ...         'b': [2, 3, 4, 5],
            ...         'month': ['jan', 'jan', 'feb', 'feb']
            ...         }
            >>> ds = ChartDataSource.from_data(data)
            >>> ds['x'] = ['a', 'b']  # say we selected a and b for dimension x

            We may want to combine 'a' and 'b' together. The final data would
            look like the following:

            >>> ds.stack_measures(['a', 'b'], var_name='a_b_variable',
            ...                   value_name='a_b_value')
            >>> ds.df
              month a_b_variable  a_b_value
            0   jan            a          1
            1   jan            a          2
            2   feb            a          3
            3   feb            a          4
            4   jan            b          2
            5   jan            b          3
            6   feb            b          4
            7   feb            b          5

            The transformed data uses the ``var_name`` and ``value_name`` inputs
            to name the derived columns. These derived columns can then be used
            as single columns to reference the values and the labels of the
            data. In the example, you could plot a_b_value vs month, and color
            by a_b_variable.

        What this does for you, over the :func:`pandas.melt` function, is apply
        the :class:`DataOperator` for a dimension if one exists (e.g.
        :class:`Blend`, generated by :func:`blend`), and try to handle the id
        columns for you, so you don't lose other columns through the melt
        transformation.

        Returns:
            None

        """
        # ToDo: Handle multiple blend operations
        for dim in self._dims:
            # find the dimension the measures are associated with
            selection = self._selections[dim]

            # because a user can generate data operators assigned to dimensions,
            # the columns must be gathered from the data operator
            if isinstance(selection, DataOperator):
                dim_cols = selection.columns
            else:
                dim_cols = selection

            # handle case where multiple stacking operations create duplicate cols
            if var_name in self.df.columns.tolist():
                var_name += '_'

            if measures == dim_cols:
                self._selections[dim] = value_name

                if ids is not None:
                    # handle case where we already stacked by one dimension/attribute
                    if all([measure in self.df.columns.tolist()
                            for measure in measures]):
                        self._data = pd.melt(self._data, id_vars=ids,
                                             value_vars=measures,
                                             var_name=var_name,
                                             value_name=value_name)
                else:
                    ids = list(set(self._data.columns) - set(measures))
                    self._data = pd.melt(self._data, id_vars=ids,
                                         value_vars=measures,
                                         var_name=var_name,
                                         value_name=value_name)

    def groupby(self, **specs):
        """Iterable of chart attribute specifications, associated with columns.

        Iterates over :class:`DataGroup`, which represents the lowest level of
        data that is assigned to the attributes for plotting.

        Yields:
            a :class:`DataGroup`, which contains metadata and attributes
            assigned to the group of data

        """
        if len(specs) == 0:
            raise ValueError(
                'You must provide one or more Attribute Specs to support iteration.')

        return groupby(self._data, **specs)

    def join_attrs(self, **attr_specs):
        """Produce a new DataFrame from the source data and the attribute specs provided.

        Args:
            **attr_specs (str, :class:`AttrSpec`, optional): pairs of names and
                attribute spec objects. These are only required if the
                `ChartDataSource` does not already contain references to the
                attribute specs.

        Returns:
            pd.DataFrame: a new dataframe that includes a column for each of the
            attribute specs joined in, plus one special column called
            `chart_index`, which contains the unique items between the different
            attribute specs

        """
        df = self._data.copy()
        if not attr_specs:
            attr_specs = self.attr_specs

        groups = []
        rows = []
        no_index = False
        for group in self.groupby(**attr_specs):
            if group.label is None:
                no_index = True
            groups.append(group)
            rows.append(group.to_dict())

        if no_index:
            attr_data = pd.DataFrame.from_records([groups[0].to_dict()])
            df['join_column'] = 'join_value'
            attr_data['join_column'] = 'join_value'
            df = pd.merge(df, attr_data, on='join_column')
            del df['join_column']
        else:
            attr_data = pd.DataFrame.from_records(rows)
            cols = list(groups[0].label.keys())
            df = pd.merge(df, attr_data, how='left', on=cols)

        return df

    @classmethod
    def from_data(cls, *args, **kwargs):
        """Automatically handle all valid inputs.

        Attempts to use any data that can be represented in a table-like format,
        along with any generated requirements, to produce a
        :class:`ChartDataSource`. Internally, a :class:`pandas.DataFrame` is
        generated from these data types.

        Identifies inputs that are array-like vs table-like, and handles them
        accordingly. If possible, existing column names are used; otherwise,
        column names are generated.

        Returns:
            :class:`ChartDataSource`

        """
        # make sure the attributes are not considered as data inputs
        attrs = kwargs.pop('attrs', None)
        if attrs is not None:
            # keep each arg that isn't a string; if it is a string, make sure
            # it isn't the name of an attribute
            args = [arg for arg in args
                    if not isinstance(arg, str) or arg not in attrs]

        arrays = [arg for arg in args if cls.is_array(arg)]
        tables = [arg for arg in args
                  if cls.is_table(arg) or cls.is_list_dicts(arg)]

        # only accept array-like or table-like input for simplicity
        if len(arrays) > 0 and len(tables) > 0:
            raise TypeError('Only input either array or table data.')

        # kwarg or list-of-arrays data
        if len(arrays) == 0 and len(tables) == 0:

            # handle list of lists
            list_dims = [k for k, v in iteritems(kwargs)
                         if (cls.is_list_arrays(v) or cls.is_array(v)) and
                         k != 'dims' and k != 'required_dims']
            if len(list_dims) > 0:
                arrays = [kwargs[dim] for dim in list_dims]
                if cls.is_list_arrays(arrays):
                    arrays = list(chain.from_iterable(arrays))

                col_names = gen_column_names(len(arrays))

                # reassign kwargs to the new columns
                new_kwargs = kwargs.copy()
                for dim in list_dims:
                    dim_cols = []
                    dim_inputs = kwargs[dim]
                    if not cls.is_list_arrays(dim_inputs) and not all(
                            [cls.is_array(dim_input) for dim_input in dim_inputs]):
                        dim_inputs = [dim_inputs]

                    # if we passed one or more literal arrays/lists, match them to cols
                    for dim_input in dim_inputs:
                        for array, col_name in zip(arrays, col_names):
                            if pd.Series.all(pd.Series(array) == pd.Series(dim_input)):
                                dim_cols.append(col_name)

                    # if only a single column was selected, pull it out of the list
                    if len(dim_cols) == 1:
                        dim_cols = dim_cols[0]

                    new_kwargs[dim] = dim_cols

                # set up kwargs to process as if we had received arrays as args
                kwargs = new_kwargs
                kwargs['columns'] = col_names
            else:
                # non-kwargs list of lists
                arrays = [arg for arg in args if cls.is_list_arrays(arg)]

        if attrs is not None:
            kwargs['attrs'] = attrs

        # handle array-like
        if len(arrays) > 0:
            kwargs['input_type'] = 'iter_array'
            return cls.from_arrays(arrays, **kwargs)

        # handle table-like
        elif len(tables) > 0:

            # only a single table input is accepted
            if len(tables) != 1:
                raise TypeError('Input a single table data type.')
            else:
                table = tables[0]

            # dict of arrays
            if isinstance(table, dict):
                if all([cls.is_array(col) for col in table.values()]):
                    kwargs['input_type'] = 'dict_array'
                    return cls(df=pd.DataFrame.from_dict(data=table), **kwargs)
                else:
                    raise TypeError('Input of table-like dict must be column-oriented.')

            # list of dicts
            elif cls.is_list_dicts(table):
                kwargs['input_type'] = 'list_dicts'
                return cls(df=pd.DataFrame.from_records(data=table), **kwargs)

            # blaze data source
            # elif string or datasource
            # ToDo: implement handling of blaze data sources if available

            # pandas dataframe
            elif isinstance(table, pd.DataFrame):
                kwargs['input_type'] = 'DataFrame'
                return cls(df=table, **kwargs)

            # unrecognized input type
            else:
                raise TypeError('Unable to recognize inputs for conversion '
                                'to dataframe for %s' % type(table))

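    # Examples (illustrative, not part of the original module) of the input
    # shapes from_data accepts:
    #
    #     ChartDataSource.from_data([1, 2, 3], [4, 5, 6])          # arrays
    #     ChartDataSource.from_data({'a': [1, 2], 'b': [3, 4]})    # dict of arrays
    #     ChartDataSource.from_data([{'a': 1}, {'a': 2}])          # list of dicts
    #     ChartDataSource.from_data(pd.DataFrame({'a': [1, 2]}))   # DataFrame
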
    @staticmethod
    def is_list_arrays(data):
        """Verify if the input data is a list of array-like data.

        Returns:
            bool

        """
        valid = False
        # ToDo: handle groups of array types, list of lists of arrays

        # avoid the case where we have a list with one list of values in it
        if (isinstance(data, list) and len(data) == 1 and
                isinstance(data[0], list) and not isinstance(data[0][0], list) and
                not ChartDataSource.is_array(data[0][0])):
            return valid

        # really want to check for nested lists, where each list might have lists
        if isinstance(data, list):
            if all([ChartDataSource.is_array(col) for col in data]):
                valid = True

        # the equivalent of a list of arrays is a table-like numpy ndarray
        elif isinstance(data, np.ndarray):
            if len(data.shape) == 2:
                valid = True

        return valid

    @property
    def df(self):
        return self._data

    @property
    def source(self):
        return ColumnDataSource(self.df)

    @staticmethod
    def _collect_dimensions(**kwargs):
        """Returns dimensions by name from kwargs.

        Returns:
            iterable(str): iterable of dimension names as strings

        """
        dims = kwargs.pop('dims', None)
        if not dims:
            return 'x', 'y'
        else:
            return dims

    @classmethod
    def from_arrays(cls, arrays, column_names=None, **kwargs):
        """Produce a :class:`ChartDataSource` from array-like data.

        Returns:
            :class:`ChartDataSource`

        """
        # handle a list of lists of arrays
        if any(cls.is_list_arrays(array) for array in arrays):
            list_of_arrays = copy(arrays)
            arrays = list(chain.from_iterable(arrays))
            column_names = column_names or gen_column_names(len(arrays))
            cols = copy(column_names)
            dims = kwargs.get('dims', DEFAULT_DIMS)

            # derive column selections
            for dim, list_of_array in zip(dims, list_of_arrays):
                sel = [cols.pop(0) for _ in list_of_array]
                kwargs[dim] = sel
        else:
            column_names = column_names or gen_column_names(len(arrays))

        # try to replace the auto-generated names with Series names
        for i, array in enumerate(arrays):
            if isinstance(array, pd.Series):
                name = array.name
                if name not in column_names and name is not None:
                    column_names[i] = name

        table = {column_name: array
                 for column_name, array in zip(column_names, arrays)}
        return cls(df=pd.DataFrame.from_dict(data=table), **kwargs)

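    # Example (illustrative, not part of the original module): Series names are
    # preferred over auto-generated column names:
    #
    #     s = pd.Series([1, 2, 3], name='mpg')
    #     ds = ChartDataSource.from_arrays([s, [4, 5, 6]])
    #     ds.columns  # -> 'mpg' plus one generated column name
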
    @classmethod
    def from_dict(cls, data, **kwargs):
        """Produce a :class:`ChartDataSource` from a table-like dict.

        Returns:
            :class:`ChartDataSource`

        """
        return cls(df=pd.DataFrame.from_dict(data), **kwargs)

    @staticmethod
    def is_table(data):
        """Verify if the data is table-like, by inspecting its type and structure.

        Returns:
            bool

        """
        return (ChartDataSource._is_valid(data, TABLE_TYPES) or
                ChartDataSource.is_list_dicts(data))

    @staticmethod
    def is_list_dicts(data):
        """Verify if the data is row-oriented, table-like data.

        Returns:
            bool

        """
        return isinstance(data, list) and all([isinstance(row, dict) for row in data])

    @staticmethod
    def is_array(data):
        """Verify if the data is array-like.

        Returns:
            bool

        """
        if ChartDataSource.is_list_dicts(data):
            # a list of dicts is a table type
            return False
        else:
            return ChartDataSource._is_valid(data, ARRAY_TYPES)

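    # Examples (illustrative, not part of the original module) of how the type
    # predicates classify inputs:
    #
    #     ChartDataSource.is_array([1, 2, 3])            # True
    #     ChartDataSource.is_array(np.array([1, 2, 3]))  # True
    #     ChartDataSource.is_array([{'a': 1}])           # False: list of dicts
    #     ChartDataSource.is_table({'a': [1, 2]})        # True
    #     ChartDataSource.is_list_dicts([{'a': 1}])      # True
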
    @staticmethod
    def _is_valid(data, types):
        """Checks the data against each of the types.

        Args:
            data: a generic source of data
            types: a list of classes

        Returns:
            bool

        """
        return any([isinstance(data, valid_type) for valid_type in types])

    def _validate_selections(self):
        """Raises a selection error if the selections are not valid for the requirements.

        Returns:
            None

        """
        required_dims = self._required_dims
        selections = self._selections
        dims = [dim for dim, sel in iteritems(selections) if sel is not None]

        if self.attrs is not None:
            dims = [dim for dim in dims if dim not in self.attrs]

        # look for a match between the selections and the dimensional requirements
        if len(required_dims) > 0:
            for req in required_dims:
                # ToDo: handle column type specifications
                if len(dims) < len(req):
                    # not an exact match
                    continue

                if all([dim in req for dim in dims]):
                    # found a match to the requirements
                    return

            # if we reach this point, nothing was validated, so construct
            # useful error messages
            error_str = ('Did not receive a valid combination of selections.\n\n'
                         'Valid configurations are: %s'
                         '\nReceived inputs are: %s'
                         '\n\nAvailable columns are: %s')
            req_str = [' and '.join(['%s = <Any Column>' % dim
                                     for dim in required_dim])
                       for required_dim in required_dims]
            selection_str = ['%s = %s' % (str(dim), str(sel))
                             for dim, sel in iteritems(selections)
                             if sel is not None]

            raise ValueError(error_str % (' or '.join(req_str),
                                          ', '.join(selection_str),
                                          ', '.join(self.columns)))
        else:
            # if we have no dimensional requirements, everything passes
            return

    @staticmethod
    def is_number(value):
        """Verifies that the value is a numerical type.

        Returns:
            bool

        """
        if isinstance(value, pd.Series):
            return Column(Float).is_valid(value)
        else:
            numbers = (float, ) + bokeh_integer_types
            return isinstance(value, numbers)

    @staticmethod
    def is_datetime(value):
        """Verifies that the value is a valid Datetime type, or can be converted to it.

        Returns:
            bool

        """
        try:
            dt = Datetime(value)
            dt  # shut up pyflakes
            return True

        except ValueError:
            return False

    @staticmethod
    def collect_metadata(data):
        """Introspect which columns match to which types of data."""
        # ToDo: implement column metadata collection
        return {}

    @property
    def columns(self):
        """All column names associated with the data.

        Returns:
            List(Str)

        """
        return self._data.columns

    @property
    def index(self):
        """The index for the :class:`pandas.DataFrame` data source."""
        return self._data.index

    @property
    def values(self):
        return self._data.values

    @staticmethod
    def is_computed(column):
        """Verify whether the column provided matches any of the known computed columns.

        Returns:
            bool

        """
        return column in COMPUTED_COLUMN_NAMES