Customize pandas representation so that it doesn't freeze anymore. Fixes #695

This commit is contained in:
Fabio Zadrozny 2021-11-18 14:44:14 -03:00
parent 30c150af91
commit 2a8758d18f
10 changed files with 310 additions and 17 deletions

View file

@ -108,7 +108,7 @@ from _pydevd_bundle import pydevd_vm_type
import sys
import traceback
from _pydevd_bundle.pydevd_utils import quote_smart as quote, compare_object_attrs_key, \
notify_about_gevent_if_needed, isinstance_checked, ScopeRequest, getattr_checked
notify_about_gevent_if_needed, isinstance_checked, ScopeRequest, getattr_checked, Timer
from _pydev_bundle import pydev_log, fsnotify
from _pydev_bundle.pydev_log import exception as pydev_log_exception
from _pydev_bundle import _pydev_completer
@ -821,10 +821,12 @@ class InternalGetVariable(InternalThreadCommand):
if not (_typeName == "OrderedDict" or val_dict.__class__.__name__ == "OrderedDict" or IS_PY36_OR_GREATER):
keys.sort(key=compare_object_attrs_key)
timer = Timer()
for k in keys:
val = val_dict[k]
evaluate_full_value = pydevd_xml.should_evaluate_full_value(val)
xml.write(pydevd_xml.var_to_xml(val, k, evaluate_full_value=evaluate_full_value))
timer.report_if_compute_repr_attr_slow(self.attributes, k, type(val))
xml.write("</xml>")
cmd = dbg.cmd_factory.make_get_variable_message(self.sequence, xml.getvalue())
@ -1256,6 +1258,7 @@ def internal_evaluate_expression_json(py_db, request, thread_id):
variables_response = pydevd_base_schema.build_response(request, kwargs={'body':body})
py_db.writer.add_command(NetCommand(CMD_RETURN, 0, variables_response, is_json=True))
def _evaluate_response_return_exception(py_db, request, exc_type, exc, initial_tb):
try:
tb = initial_tb

View file

@ -249,6 +249,18 @@ def as_float_in_env(env_key, default):
env_key, value))
def as_int_in_env(env_key, default):
    '''
    Read an integer setting from the process environment.

    :param str env_key:
        Name of the environment variable to look up.

    :param default:
        Value returned (untouched) when the variable is not set.

    :return:
        `default` when the variable is not set, otherwise its contents converted with `int()`.

    :raises RuntimeError:
        If the variable is set but its contents cannot be converted to an int.
    '''
    raw = os.getenv(env_key)
    if raw is not None:
        try:
            parsed = int(raw)
        except Exception:
            raise RuntimeError(
                'Error: expected the env variable: %s to be set to a int value. Found: %s' % (
                    env_key, raw))
        return parsed

    # Variable not present in the environment: fall back to the caller's default.
    return default
# If true in env, use gevent mode.
SUPPORT_GEVENT = is_true_in_env('GEVENT_SUPPORT')
@ -293,6 +305,14 @@ NEXT_VALUE_SEPARATOR = "__pydev_val__"
BUILTINS_MODULE_NAME = '__builtin__' if IS_PY2 else 'builtins'
SHOW_DEBUG_INFO_ENV = is_true_in_env(('PYCHARM_DEBUG', 'PYDEV_DEBUG', 'PYDEVD_DEBUG'))
# Pandas customization.
# Limits for the debugger's customized pandas representation. Each one may be overridden
# through the environment variable named in the call (the second argument is the default
# used when the variable is not set).
PANDAS_MAX_ROWS = as_int_in_env('PYDEVD_PANDAS_MAX_ROWS', 300)
PANDAS_MAX_COLS = as_int_in_env('PYDEVD_PANDAS_MAX_COLS', 300)
PANDAS_MAX_COLWIDTH = as_int_in_env('PYDEVD_PANDAS_MAX_COLWIDTH', 80)
# If getting an attribute or computing some value is too slow, let the user know if the given timeout elapses.
# (overridable through the PYDEVD_WARN_SLOW_RESOLVE_TIMEOUT environment variable; NOTE(review): the
# value is presumably in seconds as it is compared against time.time() differences -- confirm).
PYDEVD_WARN_SLOW_RESOLVE_TIMEOUT = as_float_in_env('PYDEVD_WARN_SLOW_RESOLVE_TIMEOUT', 0.15)
# This timeout is used to track the time to send a message saying that the evaluation
# is taking too long and possible mitigations.
# (overridable through the PYDEVD_WARN_EVALUATION_TIMEOUT environment variable).
PYDEVD_WARN_EVALUATION_TIMEOUT = as_float_in_env('PYDEVD_WARN_EVALUATION_TIMEOUT', 3.)

View file

@ -123,6 +123,7 @@ DONT_TRACE = {
'pydevd_net_command_factory_json.py': PYDEV_FILE,
'pydevd_net_command_factory_xml.py': PYDEV_FILE,
'pydevd_plugin_numpy_types.py': PYDEV_FILE,
'pydevd_plugin_pandas_types.py': PYDEV_FILE,
'pydevd_plugin_utils.py': PYDEV_FILE,
'pydevd_plugins_django_form_str.py': PYDEV_FILE,
'pydevd_process_net_command.py': PYDEV_FILE,

View file

@ -1,5 +1,5 @@
from _pydev_bundle import pydev_log
from _pydevd_bundle.pydevd_utils import hasattr_checked, DAPGrouper
from _pydevd_bundle.pydevd_utils import hasattr_checked, DAPGrouper, Timer
try:
import StringIO
except:
@ -183,6 +183,8 @@ class DefaultResolver:
# optimize the operation by removing as many items as possible in the
# first filters, leaving fewer items for later filters
timer = Timer()
cls = type(var)
for name in names:
try:
name_as_str = name
@ -204,6 +206,9 @@ class DefaultResolver:
traceback.print_exc(file=strIO)
attr = strIO.getvalue()
finally:
timer.report_if_getting_attr_slow(cls, name_as_str)
d[name_as_str] = attr
return d, used___dict__

View file

@ -11,7 +11,7 @@ from _pydev_bundle import pydev_log
from _pydevd_bundle import pydevd_vars
from _pydev_bundle.pydev_imports import Exec
from _pydevd_bundle.pydevd_frame_utils import FramesList
from _pydevd_bundle.pydevd_utils import ScopeRequest, DAPGrouper
from _pydevd_bundle.pydevd_utils import ScopeRequest, DAPGrouper, Timer
class _AbstractVariable(object):
@ -40,6 +40,7 @@ class _AbstractVariable(object):
:param dict fmt:
Format expected by the DAP (keys: 'hex': bool, 'rawString': bool)
'''
timer = Timer()
safe_repr = SafeRepr()
if fmt is not None:
safe_repr.convert_to_hex = fmt.get('hex', False)
@ -89,6 +90,7 @@ class _AbstractVariable(object):
if len(attributes) > 0:
var_data['presentationHint'] = {'attributes': attributes}
timer.report_if_compute_repr_attr_slow('', name, type_name)
return var_data
def get_children_variables(self, fmt=None, scope=None):

View file

@ -13,10 +13,12 @@ try:
except:
from urllib.parse import quote # @UnresolvedImport
import time
import inspect
import sys
from _pydevd_bundle.pydevd_constants import IS_PY3K, USE_CUSTOM_SYS_CURRENT_FRAMES, IS_PYPY, SUPPORT_GEVENT, \
GEVENT_SUPPORT_NOT_SET_MSG, GENERATED_LEN_ATTR_NAME
GEVENT_SUPPORT_NOT_SET_MSG, GENERATED_LEN_ATTR_NAME, PYDEVD_WARN_SLOW_RESOLVE_TIMEOUT, \
get_global_debugger
from _pydev_imps._pydev_saved_modules import threading
@ -454,3 +456,53 @@ def interrupt_main_thread(main_thread):
main_thread._thread.interrupt() # Jython
except:
pydev_log.exception('Error on interrupt main thread fallback.')
class Timer(object):
    '''
    Tracks the time elapsed between consecutive checkpoints (using `time.time()`) and reports
    it when it is greater than or equal to `min_diff`.

    The checkpoint is initialized on construction and reset every time `print_time()` or one
    of the `report_if_*` methods is called (regardless of whether something was reported).
    '''

    def __init__(self, min_diff=PYDEVD_WARN_SLOW_RESOLVE_TIMEOUT):
        '''
        :param float min_diff:
            Minimum elapsed time (as measured by `time.time()` differences) required for
            something to be printed/reported.
        '''
        self.min_diff = min_diff
        self._curr_time = time.time()

    def _elapsed_since_last_checkpoint(self):
        '''
        Resets the checkpoint to the current time.

        :return float:
            The time elapsed since the previous checkpoint.
        '''
        old = self._curr_time
        new = self._curr_time = time.time()
        return new - old

    def print_time(self, msg='Elapsed:'):
        '''
        Prints `msg` followed by the time elapsed since the last checkpoint if it is
        greater than or equal to `self.min_diff`.

        :param str msg:
            Label printed before the elapsed time.
        '''
        diff = self._elapsed_since_last_checkpoint()
        if diff >= self.min_diff:
            # The format below already adds the ': ' separator, so, drop one trailing colon
            # from the label (the default label has one) to avoid printing 'Elapsed:: 0.20s'.
            if isinstance(msg, str) and msg.endswith(':'):
                msg = msg[:-1]
            print('%s: %.2fs' % (msg, diff))

    def _report_slow(self, compute_msg, *args):
        '''
        If the time elapsed since the last checkpoint is greater than or equal to
        `self.min_diff` and a global debugger is available, sends a warning message
        through the debugger's writer.

        :param callable compute_msg:
            Called as `compute_msg(diff, *args)` to create the message (only called when
            the message will actually be sent).
        '''
        diff = self._elapsed_since_last_checkpoint()
        if diff >= self.min_diff:
            py_db = get_global_debugger()
            if py_db is not None:
                msg = compute_msg(diff, *args)
                py_db.writer.add_command(py_db.cmd_factory.make_warning_message(msg))

    def report_if_compute_repr_attr_slow(self, attrs_tab_separated, attr_name, attr_type):
        '''
        Reports a warning if computing the repr of an attribute was slow.

        :param str attrs_tab_separated:
            Tab-separated path of the parent attributes (may be empty).

        :param attr_name:
            The name of the attribute whose repr was computed.

        :param attr_type:
            The type of the attribute (its `__name__` is used in the message when available).
        '''
        self._report_slow(self._compute_repr_slow, attrs_tab_separated, attr_name, attr_type)

    def _compute_repr_slow(self, diff, attrs_tab_separated, attr_name, attr_type):
        '''
        :return str:
            The warning message for a slow repr computation.
        '''
        try:
            attr_type = attr_type.__name__
        except:
            # Deliberate best-effort: if the name can't be obtained use what was passed as is.
            pass
        if attrs_tab_separated:
            return 'pydevd warning: Computing repr of %s.%s (%s) was slow (took %.2fs)\n' % (
                attrs_tab_separated.replace('\t', '.'), attr_name, attr_type, diff)
        else:
            return 'pydevd warning: Computing repr of %s (%s) was slow (took %.2fs)\n' % (
                attr_name, attr_type, diff)

    def report_if_getting_attr_slow(self, cls, attr_name):
        '''
        Reports a warning if getting an attribute was slow.

        :param cls:
            The class of the object from which the attribute was obtained (its `__name__`
            is used in the message when available).

        :param attr_name:
            The name of the attribute obtained.
        '''
        self._report_slow(self._compute_get_attr_slow, cls, attr_name)

    def _compute_get_attr_slow(self, diff, cls, attr_name):
        '''
        :return str:
            The warning message for a slow attribute access.
        '''
        try:
            cls = cls.__name__
        except:
            # Deliberate best-effort: if the name can't be obtained use what was passed as is.
            pass
        return 'pydevd warning: Getting attribute %s.%s was slow (took %.2fs)\n' % (cls, attr_name, diff)

View file

@ -7,7 +7,7 @@ from _pydevd_bundle.pydevd_constants import dict_iter_items, dict_keys, IS_PY3K,
DEFAULT_VALUE
from _pydev_bundle.pydev_imports import quote
from _pydevd_bundle.pydevd_extension_api import TypeResolveProvider, StrPresentationProvider
from _pydevd_bundle.pydevd_utils import isinstance_checked, hasattr_checked, DAPGrouper
from _pydevd_bundle.pydevd_utils import isinstance_checked, hasattr_checked, DAPGrouper, Timer
from _pydevd_bundle.pydevd_resolver import get_var_scope
try:
@ -218,7 +218,10 @@ class TypeResolveHandler(object):
for provider in self._str_providers:
if provider.can_provide(type_object, type_name):
self._type_to_str_provider_cache[type_object] = provider
return provider.get_str(o)
try:
return provider.get_str(o)
except:
pydev_log.exception("Error when getting str with custom provider: %s." % (provider,))
self._type_to_str_provider_cache[type_object] = self.NO_PROVIDER
return None

View file

@ -3,25 +3,19 @@ from _pydevd_bundle.pydevd_resolver import defaultResolver, MAX_ITEMS_TO_HANDLE,
from .pydevd_helpers import find_mod_attr
# =======================================================================================================================
# NdArrayResolver
# =======================================================================================================================
class NdArrayResolver: pass
class NdArrayItemsContainer: pass
class NdArrayItemsContainer(object):
pass
class NDArrayTypeResolveProvider(object):
'''
This resolves a numpy ndarray returning some metadata about the NDArray
'''
def can_provide(self, type_object, type_name):
nd_array = find_mod_attr('numpy', 'ndarray')
return nd_array is not None and issubclass(type_object, nd_array)
'''
This resolves a numpy ndarray returning some metadata about the NDArray
'''
def is_numeric(self, obj):
if not hasattr(obj, 'dtype'):
return False

View file

@ -0,0 +1,133 @@
import sys
from _pydevd_bundle.pydevd_constants import PANDAS_MAX_ROWS, PANDAS_MAX_COLS, PANDAS_MAX_COLWIDTH
from _pydevd_bundle.pydevd_extension_api import TypeResolveProvider, StrPresentationProvider
from _pydevd_bundle.pydevd_resolver import inspect, MethodWrapperType
from _pydevd_bundle.pydevd_utils import Timer
from .pydevd_helpers import find_mod_attr
def _get_dictionary(obj, replacements):
    '''
    Builds the `attribute name -> value` mapping shown for `obj`.

    :param obj:
        The object whose attributes (as listed by `dir(obj)`) should be collected.

    :param dict replacements:
        Maps attribute names to a placeholder which is used instead of the real value
        (so that the real attribute is never accessed).

    :return dict:
        The attributes found. Routines and method wrappers are left out and attributes
        which raise an `Exception` when accessed are mapped to an error string.
    '''
    found = {}
    obj_class = obj.__class__

    for attr_name in dir(obj):
        # Note: properties found in the class are still evaluated on purpose (replacing them
        # by a '<property (not computed)>' placeholder was considered, but it actually
        # hides too much info from the dataframe).
        timer = Timer()
        try:
            placeholder = replacements.get(attr_name)
            if placeholder is None:
                attr_value = getattr(obj, attr_name, '<unable to get>')
                is_routine = inspect.isroutine(attr_value) or isinstance(attr_value, MethodWrapperType)
                if not is_routine:
                    found[attr_name] = attr_value
            else:
                found[attr_name] = placeholder
        except Exception as e:
            found[attr_name] = '<error getting: %s>' % (e,)
        finally:
            # Always checked (even for placeholders/skipped entries) and a new timer is
            # created for each attribute.
            timer.report_if_getting_attr_slow(obj_class, attr_name)

    return found
class PandasDataFrameTypeResolveProvider(object):
    '''
    Customizes how pandas DataFrames are listed and converted to a string, avoiding
    attributes/representations which are known to be expensive to compute.
    '''

    def can_provide(self, type_object, type_name):
        '''
        :return bool:
            True if `type_object` is a subclass of `pandas.core.frame.DataFrame` (False if
            `find_mod_attr` returns None for it).
        '''
        frame_cls = find_mod_attr('pandas.core.frame', 'DataFrame')
        if frame_cls is None:
            return False
        return issubclass(type_object, frame_cls)

    def resolve(self, obj, attribute):
        '''
        :return:
            The value of the attribute named `attribute` in `obj`.
        '''
        return getattr(obj, attribute)

    def get_dictionary(self, obj):
        '''
        :return dict:
            The attributes of the dataframe (with the expensive ones replaced by a placeholder).
        '''
        skipped = {}

        # Accessing `T` actually calls DataFrame.transpose(), which can be expensive, so,
        # just show some string representation for it.
        skipped['T'] = '<transposed dataframe -- debugger:skipped eval>'

        # `_series` creates a whole new dict {index: Series} for each column. Doing a
        # subsequent repr() of that dict can be very slow, so, don't return it.
        skipped['_series'] = '<dict[index:Series] -- debugger:skipped eval>'

        skipped['style'] = '<pandas.io.formats.style.Styler -- debugger: skipped eval>'

        return _get_dictionary(obj, skipped)

    def get_str(self, df):
        '''
        :return str:
            The string representation of the dataframe.
        '''
        # The default repr depends on the settings of:
        #     pandas.set_option('display.max_columns', None)
        #     pandas.set_option('display.max_rows', None)
        # which can make the repr **very** slow on some cases, so, use a version
        # in which the limits are explicitly passed instead.
        limits = dict(
            max_rows=PANDAS_MAX_ROWS,
            max_cols=PANDAS_MAX_COLS,
            max_colwidth=PANDAS_MAX_COLWIDTH,
        )
        return df.to_string(show_dimensions=True, **limits)
class PandasSeriesTypeResolveProvider(object):
    '''
    Customizes how pandas Series are listed and converted to a string.
    '''

    def can_provide(self, type_object, type_name):
        '''
        :return bool:
            True if `type_object` is a subclass of `pandas.core.series.Series` (False if
            `find_mod_attr` returns None for it).
        '''
        series_cls = find_mod_attr('pandas.core.series', 'Series')
        if series_cls is None:
            return False
        return issubclass(type_object, series_cls)

    def resolve(self, obj, attribute):
        '''
        :return:
            The value of the attribute named `attribute` in `obj`.
        '''
        return getattr(obj, attribute)

    def get_dictionary(self, obj):
        '''
        :return dict:
            The attributes of the series (with some replaced by a placeholder).
        '''
        # NOTE(review): these are the same names/placeholders used for dataframes (so the
        # texts mention dataframes) -- confirm whether all of them apply to a Series.
        # Names not listed by dir() for the object have no effect.
        skipped = {}
        skipped['T'] = '<transposed dataframe -- debugger:skipped eval>'
        skipped['_series'] = '<dict[index:Series] -- debugger:skipped eval>'
        skipped['style'] = '<pandas.io.formats.style.Styler -- debugger: skipped eval>'
        return _get_dictionary(obj, skipped)

    def get_str(self, series):
        '''
        :return str:
            The string representation of the series (computed passing an explicit
            `max_rows` limit).
        '''
        as_text = series.to_string(max_rows=PANDAS_MAX_ROWS)
        return as_text
class PandasStylerTypeResolveProvider(object):
    '''
    Customizes how pandas Styler objects are listed (no custom string representation
    is provided for them).
    '''

    def can_provide(self, type_object, type_name):
        '''
        :return bool:
            True if `type_object` is a subclass of `pandas.io.formats.style.Styler` (False if
            `find_mod_attr` returns None for it).
        '''
        styler_cls = find_mod_attr('pandas.io.formats.style', 'Styler')
        if styler_cls is None:
            return False
        return issubclass(type_object, styler_cls)

    def resolve(self, obj, attribute):
        '''
        :return:
            The value of the attribute named `attribute` in `obj`.
        '''
        return getattr(obj, attribute)

    def get_dictionary(self, obj):
        '''
        :return dict:
            The attributes of the styler (`data` and `__dict__` are replaced by placeholders
            instead of being evaluated).
        '''
        skipped = {}
        skipped['data'] = '<Styler data -- debugger:skipped eval>'
        skipped['__dict__'] = '<dict -- debugger: skipped eval>'
        return _get_dictionary(obj, skipped)
# Register the pandas providers in the extension API (done at import time).
# Nothing is registered when `sys.platform` starts with "java" (presumably Jython -- confirm).
if not sys.platform.startswith("java"):
    # DataFrame and Series customize both the listing of attributes and the string representation.
    TypeResolveProvider.register(PandasDataFrameTypeResolveProvider)
    StrPresentationProvider.register(PandasDataFrameTypeResolveProvider)
    TypeResolveProvider.register(PandasSeriesTypeResolveProvider)
    StrPresentationProvider.register(PandasSeriesTypeResolveProvider)
    # The Styler provider has no `get_str`, so, it only customizes the listing of attributes.
    TypeResolveProvider.register(PandasStylerTypeResolveProvider)

View file

@ -5800,6 +5800,86 @@ def test_function_breakpoints_async(case_setup):
writer.finished_ok = True
try:
import pandas
except:
pandas = None
@pytest.mark.skipif(pandas is None, reason='Pandas not installed.')
def test_pandas(case_setup, pyfile):
    '''
    Checks the customized string representation and attribute listing of pandas
    DataFrame / Series / Styler objects when stopped at a breakpoint.
    '''

    @pyfile
    def pandas_mod():
        import pandas as pd
        import numpy as np

        rows = 5000
        cols = 50

        # i.e.: even with these setting our repr will print at most 300 lines/cols by default.
        pd.set_option('display.max_columns', None)
        pd.set_option('display.max_rows', None)

        items = rows * cols
        df = pd.DataFrame(np.arange(items).reshape(rows, cols)).applymap(lambda x: 'Test String')
        series = df._series[0]
        styler = df.style

        print('TEST SUCEEDED')  # Break here

    def check_child_value(json_facade, parent_var, child_name, expected_value):
        # Requests the children of `parent_var` and checks the value of the first
        # child named `child_name` (which is required to be there).
        response = json_facade.get_variables_response(parent_var.variablesReference)
        child = next((v for v in response.body.variables if v['name'] == child_name), None)
        if child is None:
            raise AssertionError('Did not find variable "%s".' % (child_name,))
        assert child['value'] == expected_value

    with case_setup.test_file(pandas_mod) as writer:
        json_facade = JsonFacade(writer)
        json_facade.write_launch(justMyCode=False)

        break_line = writer.get_line_index_with_content('Break here')
        json_facade.write_set_breakpoints([break_line])

        json_facade.write_make_initial_run()

        json_hit = json_facade.wait_for_thread_stopped()
        name_to_var = json_facade.get_locals_name_to_var(json_hit.frame_id)

        # Custom repr(DataFrame).
        df_text = name_to_var['df'].value
        assert df_text.count('\n') == 303
        assert '...' in df_text

        # Custom repr(Series).
        series_text = name_to_var['series'].value
        assert series_text.count('\n') == 300
        assert '...' in series_text

        # Custom listing (DataFrame).
        check_child_value(
            json_facade, name_to_var['df'], 'T', "'<transposed dataframe -- debugger:skipped eval>'")

        # Custom listing (Series).
        check_child_value(
            json_facade, name_to_var['series'], 'T', "'<transposed dataframe -- debugger:skipped eval>'")

        # Custom listing (Styler).
        check_child_value(
            json_facade, name_to_var['styler'], 'data', "'<Styler data -- debugger:skipped eval>'")

        json_facade.write_continue()
        writer.finished_ok = True
# When this module is executed directly, run pytest selecting (-k) 'test_case_skipping_filters'
# and passing '-s'.
if __name__ == '__main__':
    pytest.main(['-k', 'test_case_skipping_filters', '-s'])