Skip to content Skip to sidebar Skip to footer

Performance Of Creating New Dataframe

I was very surprised by the timings of creating DataFrames in this question: #[30000 rows x 2 columns] df = pd.concat([pd.DataFrame({'fruits': ['apples', 'grapes', 'figs'],

Solution 1:

After some painful debugging I can confirm the sequence that the slow one takes, in the DataFrame ctor :

 elif isinstance(data, (list, types.GeneratorType)):
            if isinstance(data, types.GeneratorType):
                data = list(data)
            if len(data) > 0:
                if is_list_like(data[0]) and getattr(data[0], 'ndim', 1) == 1:
                    if is_named_tuple(data[0]) and columns is None:
                        columns = data[0]._fields
                    arrays, columns = _to_arrays(data, columns, dtype=dtype)

Here it tests the type of the passed data; as it's list-like, it then tries to test each element for its type. It's not expecting a list containing an np array, so then it comes here:

def _to_arrays(data, columns, coerce_float=False, dtype=None):
    """
    Return list of arrays, columns.

    Dispatches on the type of ``data``: a DataFrame is unpacked column by
    column, an empty input short-circuits, and a list of lists/tuples is
    handed off to ``_list_to_arrays`` (excerpt truncates the remaining
    branches of the real pandas function).
    """
    if isinstance(data, DataFrame):
        if columns is not None:
            # Keep only the requested columns, preserving DataFrame order.
            arrays = [data._ixs(i, axis=1).values
                      for i, col in enumerate(data.columns) if col in columns]
        else:
            columns = data.columns
            arrays = [data._ixs(i, axis=1).values for i in range(len(columns))]

        return arrays, columns

    if not len(data):
        # Empty input: a structured ndarray still contributes column names.
        if isinstance(data, np.ndarray):
            columns = data.dtype.names
            if columns is not None:
                return [[]] * len(columns), columns
        return [], []  # columns if columns is not None else []
    if isinstance(data[0], (list, tuple)):
        return _list_to_arrays(data, columns, coerce_float=coerce_float,
                               dtype=dtype)

then here:

def _list_to_arrays(data, columns, coerce_float=False, dtype=None):
    """Convert a list of rows into per-column object arrays.

    Rows that are tuples take the tuple-specialised Cython helper;
    everything else goes through the generic list-of-lists path. The
    transpose (``.T``) turns the row-major object array column-major.
    """
    if len(data) > 0 and isinstance(data[0], tuple):
        content = list(lib.to_object_array_tuples(data).T)
    else:
        # list of lists
        content = list(lib.to_object_array(data).T)
    return _convert_object_array(content, columns, dtype=dtype,
                                 coerce_float=coerce_float)

and finally here:

def _convert_object_array(content, columns, coerce_float=False, dtype=None):
    """Soft-convert a list of object columns, returning (arrays, columns).

    Supplies a default RangeIndex when no columns were given, validates the
    column count otherwise, and attempts dtype inference on each column
    unless an explicit object dtype was requested.
    """
    if columns is None:
        columns = _default_index(len(content))
    else:
        if len(columns) != len(content):  # pragma: no cover
            # caller's responsibility to check for this...
            raise AssertionError('%d columns passed, passed data had %s '
                                 'columns' % (len(columns), len(content)))

    # provide soft conversion of object dtypes
    def convert(arr):
        if dtype != object and dtype != np.object:
            arr = lib.maybe_convert_objects(arr, try_float=coerce_float)
            arr = _possibly_cast_to_datetime(arr, dtype)
        return arr

    arrays = [convert(arr) for arr in content]

    return arrays, columns

You can see that there is no optimisation in the construction it performs and it essentially just iterates through every element, converts it (which will copy it) and returns a list of arrays.

For the other path, as the np array shape and dtypes are more pandas friendly it can take a view on the data or copy if required but it already knows enough to optimise the construction

Solution 2:

@EdChum's comment is on point

just looking at how pandas handle list data vs array data you will understand quickly that passing a list is more complicated.

array:

elif isinstance(data, (np.ndarray, Series, Index)):
            if data.dtype.names:
                data_columns = list(data.dtype.names)
                data = dict((k, data[k]) for k in data_columns)
                if columns is None:
                    columns = data_columns
                mgr = self._init_dict(data, index, columns, dtype=dtype)
            elif getattr(data, 'name', None):
                mgr = self._init_dict({data.name: data}, index, columns,
                                      dtype=dtype)
            else:
                mgr = self._init_ndarray(data, index, columns, dtype=dtype,
                                         copy=copy)

now if it's a list:

elif isinstance(data, (list, types.GeneratorType)):
    if isinstance(data, types.GeneratorType):
        data = list(data)
    if len(data) > 0:
        if is_list_like(data[0]) and getattr(data[0], 'ndim', 1) == 1:
            if is_named_tuple(data[0]) and columns is None:
                columns = data[0]._fields
            arrays, columns = _to_arrays(data, columns, dtype=dtype)
            columns = _ensure_index(columns)

            # set the index
            if index is None:
                if isinstance(data[0], Series):
                    index = _get_names_from_index(data)
                elif isinstance(data[0], Categorical):
                    index = _default_index(len(data[0]))
                else:
                    index = _default_index(len(data))

            mgr = _arrays_to_mgr(arrays, columns, index, columns,
                                 dtype=dtype)
        else:
            mgr = self._init_ndarray(data, index, columns, dtype=dtype,
                                     copy=copy)

Post a Comment for "Performance Of Creating New Dataframe"