I'm trying to profile an Excel file. It is a very small data set (only 30 columns and 535 rows), but when I run the profile_report function it stops at a different percentage each time, always with the same error message:
---------------------------------------------------------------------------
MemoryError Traceback (most recent call last)
<ipython-input-41-283dd2cb2000> in <module>
1 df=pd.read_excel(path_working+'Documents/Information/'+'sample.xlsx')
2 profile = df.profile_report(title='Sample Exploratory')
----> 3 profile.to_file(path_working+'sample.html')
~\anaconda3\lib\site-packages\pandas_profiling\profile_report.py in to_file(self, output_file, silent)
276 create_html_assets(output_file)
277
--> 278 data = self.to_html()
279
280 if output_file.suffix != ".html":
~\anaconda3\lib\site-packages\pandas_profiling\profile_report.py in to_html(self)
384
385 """
--> 386 return self.html
387
388 def to_json(self) -> str:
~\anaconda3\lib\site-packages\pandas_profiling\profile_report.py in html(self)
199 def html(self):
200 if self._html is None:
--> 201 self._html = self._render_html()
202 return self._html
203
~\anaconda3\lib\site-packages\pandas_profiling\profile_report.py in _render_html(self)
306 from pandas_profiling.report.presentation.flavours import HTMLReport
307
--> 308 report = self.report
309
310 disable_progress_bar = not config["progress_bar"].get(bool)
~\anaconda3\lib\site-packages\pandas_profiling\profile_report.py in report(self)
193 def report(self):
194 if self._report is None:
--> 195 self._report = get_report_structure(self.description_set)
196 return self._report
197
~\anaconda3\lib\site-packages\pandas_profiling\profile_report.py in description_set(self)
172 def description_set(self):
173 if self._description_set is None:
--> 174 self._description_set = describe_df(
175 self.title, self.df, self.summarizer, self.typeset, self._sample
176 )
~\anaconda3\lib\site-packages\pandas_profiling\model\describe.py in describe(title, df, summarizer, typeset, sample)
72 total=number_of_tasks, desc="Summarize dataset", disable=disable_progress_bar
73 ) as pbar:
---> 74 series_description = get_series_descriptions(df, summarizer, typeset, pbar)
75
76 pbar.set_postfix_str("Get variable types")
~\anaconda3\lib\site-packages\pandas_profiling\model\summary.py in get_series_descriptions(df, summarizer, typeset, pbar)
97 # TODO: use `Pool` for Linux-based systems
98 with multiprocessing.pool.ThreadPool(pool_size) as executor:
---> 99 for i, (column, description) in enumerate(
100 executor.imap_unordered(multiprocess_1d, args)
101 ):
~\anaconda3\lib\multiprocessing\pool.py in next(self, timeout)
866 if success:
867 return value
--> 868 raise value
869
870 __next__ = next # XXX
~\anaconda3\lib\multiprocessing\pool.py in worker(inqueue, outqueue, initializer, initargs, maxtasks, wrap_exception)
123 job, i, func, args, kwds = task
124 try:
--> 125 result = (True, func(*args, **kwds))
126 except Exception as e:
127 if wrap_exception and func is not _helper_reraises_exception:
~\anaconda3\lib\site-packages\pandas_profiling\model\summary.py in multiprocess_1d(args)
76 """
77 column, series = args
---> 78 return column, describe_1d(series, summarizer, typeset)
79
80 pool_size = config["pool_size"].get(int)
~\anaconda3\lib\site-packages\pandas_profiling\model\summary.py in describe_1d(series, summarizer, typeset)
50 vtype = typeset.detect_type(series)
51
---> 52 return summarizer.summarize(series, dtype=vtype)
53
54
~\anaconda3\lib\site-packages\pandas_profiling\model\summarizer.py in summarize(self, series, dtype)
54 """
55 summarizer_func = compose(self.summary_map.get(dtype, []))
---> 56 _, summary = summarizer_func(series, {"type": dtype})
57 return summary
58
~\anaconda3\lib\site-packages\pandas_profiling\model\handler.py in func2(*x)
21 return f(*x)
22 else:
---> 23 return f(*res)
24
25 return func2
~\anaconda3\lib\site-packages\pandas_profiling\model\handler.py in func2(*x)
21 return f(*x)
22 else:
---> 23 return f(*res)
24
25 return func2
~\anaconda3\lib\site-packages\pandas_profiling\model\handler.py in func2(*x)
21 return f(*x)
22 else:
---> 23 return f(*res)
24
25 return func2
~\anaconda3\lib\site-packages\pandas_profiling\model\handler.py in func2(*x)
17 def func(f, g):
18 def func2(*x):
---> 19 res = g(*x)
20 if type(res) == bool:
21 return f(*x)
~\anaconda3\lib\site-packages\pandas_profiling\model\summary_algorithms.py in inner(series, summary)
70 if not summary["hashable"]:
71 return series, summary
---> 72 return fn(series, summary)
73
74 return inner
~\anaconda3\lib\site-packages\visions\utils\series_utils.py in inner(series, state, *args, **kwargs)
40 return False
41
---> 42 return fn(series, state, *args, **kwargs)
43
44 return inner
~\anaconda3\lib\site-packages\pandas_profiling\model\summary_algorithms.py in describe_numeric_1d(series, summary)
208
209 if chi_squared_threshold > 0.0:
--> 210 stats["chi_squared"] = chi_square(finite_values)
211
212 stats["range"] = stats["max"] - stats["min"]
~\anaconda3\lib\site-packages\pandas_profiling\model\summary_helpers.py in chi_square(values, histogram)
352 def chi_square(values=None, histogram=None):
353 if histogram is None:
--> 354 histogram, _ = np.histogram(values, bins="auto")
355 return dict(chisquare(histogram)._asdict())
356
<__array_function__ internals> in histogram(*args, **kwargs)
~\anaconda3\lib\site-packages\numpy\lib\histograms.py in histogram(a, bins, range, normed, weights, density)
790 a, weights = _ravel_and_check_weights(a, weights)
791
--> 792 bin_edges, uniform_bins = _get_bin_edges(a, bins, range, weights)
793
794 # Histogram is an integer or a float array depending on the weights.
~\anaconda3\lib\site-packages\numpy\lib\histograms.py in _get_bin_edges(a, bins, range, weights)
444
445 # bin edges must be computed
--> 446 bin_edges = np.linspace(
447 first_edge, last_edge, n_equal_bins + 1,
448 endpoint=True, dtype=bin_type)
<__array_function__ internals> in linspace(*args, **kwargs)
~\anaconda3\lib\site-packages\numpy\core\function_base.py in linspace(start, stop, num, endpoint, retstep, dtype, axis)
126
127 delta = stop - start
--> 128 y = _nx.arange(0, num, dtype=dt).reshape((-1,) + (1,) * ndim(delta))
129 # In-place multiplication y *= delta/div is faster, but prevents the multiplicant
130 # from overriding what class is produced, and thus prevents, e.g. use of Quantities,
MemoryError: Unable to allocate 1.75 EiB for an array with shape (251938683619878560,) and data type float64
I ran the same code in a different python installation and it ran fine.
Thank you all in advance and let me know if you need more information.
This is a bug in numpy.histogram (https://github.com/numpy/numpy/issues/10297), also reported on SO (Numpy histogram extremely slow on small data set).
This error is caused by the call to
np.histogram(values, bins="auto")
in pandas-profiling's chi_square helper (see the traceback above). When the input contains very large values, the "auto" method can fail while trying to generate an enormous number of bins that cannot fit in RAM. As a workaround, you can remove the very large values manually before generating the report.