Performance Of Creating New Dataframe
Solution 1:
After some painful debugging I can confirm the sequence that the slow one takes, in the DataFrame ctor :
elif isinstance(data, (list, types.GeneratorType)):
    if isinstance(data, types.GeneratorType):
        data = list(data)
    if len(data) > 0:
        if is_list_like(data[0]) and getattr(data[0], 'ndim', 1) == 1:
            if is_named_tuple(data[0]) and columns is None:
                columns = data[0]._fields
            arrays, columns = _to_arrays(data, columns, dtype=dtype)
Here it tests the type of the passed data; as it's list-like, it then tries to test each element for its type. It's not expecting a list containing a np array, so it then comes here:
def _to_arrays(data, columns, coerce_float=False, dtype=None):
    """
    Return list of arrays, columns
    """
    if isinstance(data, DataFrame):
        if columns is not None:
            arrays = [data._ixs(i, axis=1).values
                      for i, col in enumerate(data.columns) if col in columns]
        else:
            columns = data.columns
            arrays = [data._ixs(i, axis=1).values for i in range(len(columns))]
        return arrays, columns

    if not len(data):
        if isinstance(data, np.ndarray):
            columns = data.dtype.names
            if columns is not None:
                return [[]] * len(columns), columns
        return [], []  # columns if columns is not None else []
    if isinstance(data[0], (list, tuple)):
        return _list_to_arrays(data, columns, coerce_float=coerce_float,
                               dtype=dtype)
then here:
def _list_to_arrays(data, columns, coerce_float=False, dtype=None):
    if len(data) > 0 and isinstance(data[0], tuple):
        content = list(lib.to_object_array_tuples(data).T)
    else:
        # list of lists
        content = list(lib.to_object_array(data).T)
    return _convert_object_array(content, columns, dtype=dtype,
                                 coerce_float=coerce_float)
and finally here:
def _convert_object_array(content, columns, coerce_float=False, dtype=None):
    if columns is None:
        columns = _default_index(len(content))
    else:
        if len(columns) != len(content):  # pragma: no cover
            # caller's responsibility to check for this...
            raise AssertionError('%d columns passed, passed data had %s '
                                 'columns' % (len(columns), len(content)))

    # provide soft conversion of object dtypes
    def convert(arr):
        if dtype != object and dtype != np.object:
            arr = lib.maybe_convert_objects(arr, try_float=coerce_float)
            arr = _possibly_cast_to_datetime(arr, dtype)
        return arr

    arrays = [convert(arr) for arr in content]

    return arrays, columns
You can see that there is no optimisation in the construction it performs: it essentially just iterates through every element, converts it (which will copy it), and returns a list of arrays.
For the other path, as the np array shape and dtypes are more pandas friendly it can take a view on the data or copy if required but it already knows enough to optimise the construction
Solution 2:
@EdChum's comment is on point.
Just looking at how pandas handles list data vs array data, you will quickly understand that passing a list is more complicated.
array:
elif isinstance(data, (np.ndarray, Series, Index)):
    if data.dtype.names:
        data_columns = list(data.dtype.names)
        data = dict((k, data[k]) for k in data_columns)
        if columns is None:
            columns = data_columns
        mgr = self._init_dict(data, index, columns, dtype=dtype)
    elif getattr(data, 'name', None):
        mgr = self._init_dict({data.name: data}, index, columns,
                              dtype=dtype)
    else:
        mgr = self._init_ndarray(data, index, columns, dtype=dtype,
                                 copy=copy)
now if it's a list:
elif isinstance(data, (list, types.GeneratorType)):
    if isinstance(data, types.GeneratorType):
        data = list(data)
    if len(data) > 0:
        if is_list_like(data[0]) and getattr(data[0], 'ndim', 1) == 1:
            if is_named_tuple(data[0]) and columns is None:
                columns = data[0]._fields
            arrays, columns = _to_arrays(data, columns, dtype=dtype)
            columns = _ensure_index(columns)

            # set the index
            if index is None:
                if isinstance(data[0], Series):
                    index = _get_names_from_index(data)
                elif isinstance(data[0], Categorical):
                    index = _default_index(len(data[0]))
                else:
                    index = _default_index(len(data))
            mgr = _arrays_to_mgr(arrays, columns, index, columns,
                                 dtype=dtype)
        else:
            mgr = self._init_ndarray(data, index, columns, dtype=dtype,
                                     copy=copy)
Post a Comment for "Performance Of Creating New Dataframe"