MemoryError when using pandas_profiling profile_report


I'm trying to profile an Excel file. It is a very small data set, only 30 columns and 535 rows, but when I run profile_report it stops at a different percentage each time, always with the same error message:

---------------------------------------------------------------------------
MemoryError                               Traceback (most recent call last)
<ipython-input-41-283dd2cb2000> in <module>
      1 df=pd.read_excel(path_working+'Documents/Information/'+'sample.xlsx')
      2 profile = df.profile_report(title='Sample Exploratory')
----> 3 profile.to_file(path_working+'sample.html')

~\anaconda3\lib\site-packages\pandas_profiling\profile_report.py in to_file(self, output_file, silent)
    276                 create_html_assets(output_file)
    277 
--> 278             data = self.to_html()
    279 
    280             if output_file.suffix != ".html":

~\anaconda3\lib\site-packages\pandas_profiling\profile_report.py in to_html(self)
    384 
    385         """
--> 386         return self.html
    387 
    388     def to_json(self) -> str:

~\anaconda3\lib\site-packages\pandas_profiling\profile_report.py in html(self)
    199     def html(self):
    200         if self._html is None:
--> 201             self._html = self._render_html()
    202         return self._html
    203 

~\anaconda3\lib\site-packages\pandas_profiling\profile_report.py in _render_html(self)
    306         from pandas_profiling.report.presentation.flavours import HTMLReport
    307 
--> 308         report = self.report
    309 
    310         disable_progress_bar = not config["progress_bar"].get(bool)

~\anaconda3\lib\site-packages\pandas_profiling\profile_report.py in report(self)
    193     def report(self):
    194         if self._report is None:
--> 195             self._report = get_report_structure(self.description_set)
    196         return self._report
    197 

~\anaconda3\lib\site-packages\pandas_profiling\profile_report.py in description_set(self)
    172     def description_set(self):
    173         if self._description_set is None:
--> 174             self._description_set = describe_df(
    175                 self.title, self.df, self.summarizer, self.typeset, self._sample
    176             )

~\anaconda3\lib\site-packages\pandas_profiling\model\describe.py in describe(title, df, summarizer, typeset, sample)
     72         total=number_of_tasks, desc="Summarize dataset", disable=disable_progress_bar
     73     ) as pbar:
---> 74         series_description = get_series_descriptions(df, summarizer, typeset, pbar)
     75 
     76         pbar.set_postfix_str("Get variable types")

~\anaconda3\lib\site-packages\pandas_profiling\model\summary.py in get_series_descriptions(df, summarizer, typeset, pbar)
     97         # TODO: use `Pool` for Linux-based systems
     98         with multiprocessing.pool.ThreadPool(pool_size) as executor:
---> 99             for i, (column, description) in enumerate(
    100                 executor.imap_unordered(multiprocess_1d, args)
    101             ):

~\anaconda3\lib\multiprocessing\pool.py in next(self, timeout)
    866         if success:
    867             return value
--> 868         raise value
    869 
    870     __next__ = next                    # XXX

~\anaconda3\lib\multiprocessing\pool.py in worker(inqueue, outqueue, initializer, initargs, maxtasks, wrap_exception)
    123         job, i, func, args, kwds = task
    124         try:
--> 125             result = (True, func(*args, **kwds))
    126         except Exception as e:
    127             if wrap_exception and func is not _helper_reraises_exception:

~\anaconda3\lib\site-packages\pandas_profiling\model\summary.py in multiprocess_1d(args)
     76         """
     77         column, series = args
---> 78         return column, describe_1d(series, summarizer, typeset)
     79 
     80     pool_size = config["pool_size"].get(int)

~\anaconda3\lib\site-packages\pandas_profiling\model\summary.py in describe_1d(series, summarizer, typeset)
     50         vtype = typeset.detect_type(series)
     51 
---> 52     return summarizer.summarize(series, dtype=vtype)
     53 
     54 

~\anaconda3\lib\site-packages\pandas_profiling\model\summarizer.py in summarize(self, series, dtype)
     54         """
     55         summarizer_func = compose(self.summary_map.get(dtype, []))
---> 56         _, summary = summarizer_func(series, {"type": dtype})
     57         return summary
     58 

~\anaconda3\lib\site-packages\pandas_profiling\model\handler.py in func2(*x)
     21                 return f(*x)
     22             else:
---> 23                 return f(*res)
     24 
     25         return func2

~\anaconda3\lib\site-packages\pandas_profiling\model\handler.py in func2(*x)
     21                 return f(*x)
     22             else:
---> 23                 return f(*res)
     24 
     25         return func2

~\anaconda3\lib\site-packages\pandas_profiling\model\handler.py in func2(*x)
     21                 return f(*x)
     22             else:
---> 23                 return f(*res)
     24 
     25         return func2

~\anaconda3\lib\site-packages\pandas_profiling\model\handler.py in func2(*x)
     17     def func(f, g):
     18         def func2(*x):
---> 19             res = g(*x)
     20             if type(res) == bool:
     21                 return f(*x)

~\anaconda3\lib\site-packages\pandas_profiling\model\summary_algorithms.py in inner(series, summary)
     70         if not summary["hashable"]:
     71             return series, summary
---> 72         return fn(series, summary)
     73 
     74     return inner

~\anaconda3\lib\site-packages\visions\utils\series_utils.py in inner(series, state, *args, **kwargs)
     40                 return False
     41 
---> 42         return fn(series, state, *args, **kwargs)
     43 
     44     return inner

~\anaconda3\lib\site-packages\pandas_profiling\model\summary_algorithms.py in describe_numeric_1d(series, summary)
    208 
    209     if chi_squared_threshold > 0.0:
--> 210         stats["chi_squared"] = chi_square(finite_values)
    211 
    212     stats["range"] = stats["max"] - stats["min"]

~\anaconda3\lib\site-packages\pandas_profiling\model\summary_helpers.py in chi_square(values, histogram)
    352 def chi_square(values=None, histogram=None):
    353     if histogram is None:
--> 354         histogram, _ = np.histogram(values, bins="auto")
    355     return dict(chisquare(histogram)._asdict())
    356 

<__array_function__ internals> in histogram(*args, **kwargs)

~\anaconda3\lib\site-packages\numpy\lib\histograms.py in histogram(a, bins, range, normed, weights, density)
    790     a, weights = _ravel_and_check_weights(a, weights)
    791 
--> 792     bin_edges, uniform_bins = _get_bin_edges(a, bins, range, weights)
    793 
    794     # Histogram is an integer or a float array depending on the weights.

~\anaconda3\lib\site-packages\numpy\lib\histograms.py in _get_bin_edges(a, bins, range, weights)
    444 
    445         # bin edges must be computed
--> 446         bin_edges = np.linspace(
    447             first_edge, last_edge, n_equal_bins + 1,
    448             endpoint=True, dtype=bin_type)

<__array_function__ internals> in linspace(*args, **kwargs)

~\anaconda3\lib\site-packages\numpy\core\function_base.py in linspace(start, stop, num, endpoint, retstep, dtype, axis)
    126 
    127     delta = stop - start
--> 128     y = _nx.arange(0, num, dtype=dt).reshape((-1,) + (1,) * ndim(delta))
    129     # In-place multiplication y *= delta/div is faster, but prevents the multiplicant
    130     # from overriding what class is produced, and thus prevents, e.g. use of Quantities,

MemoryError: Unable to allocate 1.75 EiB for an array with shape (251938683619878560,) and data type float64

I ran the same code in a different Python installation and it ran fine.

Thank you all in advance and let me know if you need more information.

1 Answer

This is a bug in numpy.histogram (https://github.com/numpy/numpy/issues/10297), also reported on Stack Overflow ("Numpy histogram extremely slow on small data set").

The error is caused by the call to np.histogram(values, bins="auto"). When the input contains very large values, the "auto" method can fail because it tries to create an enormous number of bins that cannot fit in RAM.
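The failure mode can be reproduced with plain NumPy. The following is a minimal sketch with made-up data (a tightly clustered bulk plus one extreme outlier, nothing from the question itself); the exact bin count it requests depends on the estimator and your NumPy version.

import numpy as np

# Illustrative data: a tightly clustered bulk plus one extreme outlier.
# bins="auto" derives the bin width from the spread of the bulk (roughly 0.3
# here) but must cover the full range (~1e15), so it asks for on the order of
# 1e15 bins, far more than can be allocated.
rng = np.random.default_rng(0)
values = np.concatenate([rng.normal(loc=0.0, scale=1.0, size=1000), [1e15]])

try:
    np.histogram(values, bins="auto")
except MemoryError as err:
    print(err)  # "Unable to allocate ... for an array with shape (...)"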

As a workaround, you can remove the large values manually before generating the report.
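For example, a minimal sketch of that workaround (the file path, quantile cut-offs, and the choice to drop rows rather than clip values are placeholders to adapt to your data):

import numpy as np
import pandas as pd
import pandas_profiling  # registers the DataFrame.profile_report accessor

df = pd.read_excel("sample.xlsx")  # placeholder path

# Drop rows whose numeric values fall outside a generous quantile band so the
# "auto" bin estimator no longer has to span an enormous range.
for col in df.select_dtypes(include=np.number).columns:
    low, high = df[col].quantile([0.001, 0.999])
    df = df[df[col].between(low, high) | df[col].isna()]

profile = df.profile_report(title="Sample Exploratory")
profile.to_file("sample.html")

If you need to keep every row, clipping the outliers with Series.clip instead of filtering them out has the same effect on the histogram.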