From 2a8758d18f0161cc23206c088ad29721bd2b68b5 Mon Sep 17 00:00:00 2001 From: Fabio Zadrozny Date: Thu, 18 Nov 2021 14:44:14 -0300 Subject: [PATCH] Customize pandas representation so that it doesn't freeze anymore. Fixes #695 --- .../pydevd/_pydevd_bundle/pydevd_comm.py | 5 +- .../pydevd/_pydevd_bundle/pydevd_constants.py | 20 +++ .../_pydevd_bundle/pydevd_dont_trace_files.py | 1 + .../pydevd/_pydevd_bundle/pydevd_resolver.py | 7 +- .../_pydevd_bundle/pydevd_suspended_frames.py | 4 +- .../pydevd/_pydevd_bundle/pydevd_utils.py | 54 ++++++- .../pydevd/_pydevd_bundle/pydevd_xml.py | 7 +- .../types/pydevd_plugin_numpy_types.py | 16 +-- .../types/pydevd_plugin_pandas_types.py | 133 ++++++++++++++++++ .../pydevd/tests_python/test_debugger_json.py | 80 +++++++++++ 10 files changed, 310 insertions(+), 17 deletions(-) create mode 100644 src/debugpy/_vendored/pydevd/pydevd_plugins/extensions/types/pydevd_plugin_pandas_types.py diff --git a/src/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_comm.py b/src/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_comm.py index 917e3a5c..ba6cf9f1 100644 --- a/src/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_comm.py +++ b/src/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_comm.py @@ -108,7 +108,7 @@ from _pydevd_bundle import pydevd_vm_type import sys import traceback from _pydevd_bundle.pydevd_utils import quote_smart as quote, compare_object_attrs_key, \ - notify_about_gevent_if_needed, isinstance_checked, ScopeRequest, getattr_checked + notify_about_gevent_if_needed, isinstance_checked, ScopeRequest, getattr_checked, Timer from _pydev_bundle import pydev_log, fsnotify from _pydev_bundle.pydev_log import exception as pydev_log_exception from _pydev_bundle import _pydev_completer @@ -821,10 +821,12 @@ class InternalGetVariable(InternalThreadCommand): if not (_typeName == "OrderedDict" or val_dict.__class__.__name__ == "OrderedDict" or IS_PY36_OR_GREATER): keys.sort(key=compare_object_attrs_key) + timer = Timer() for k in keys: val = val_dict[k] evaluate_full_value = pydevd_xml.should_evaluate_full_value(val) xml.write(pydevd_xml.var_to_xml(val, k, evaluate_full_value=evaluate_full_value)) + timer.report_if_compute_repr_attr_slow(self.attributes, k, type(val)) xml.write("") cmd = dbg.cmd_factory.make_get_variable_message(self.sequence, xml.getvalue()) @@ -1256,6 +1258,7 @@ def internal_evaluate_expression_json(py_db, request, thread_id): variables_response = pydevd_base_schema.build_response(request, kwargs={'body':body}) py_db.writer.add_command(NetCommand(CMD_RETURN, 0, variables_response, is_json=True)) + def _evaluate_response_return_exception(py_db, request, exc_type, exc, initial_tb): try: tb = initial_tb diff --git a/src/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_constants.py b/src/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_constants.py index 97df9fe9..d69b2209 100644 --- a/src/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_constants.py +++ b/src/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_constants.py @@ -249,6 +249,18 @@ def as_float_in_env(env_key, default): env_key, value)) +def as_int_in_env(env_key, default): + value = os.getenv(env_key) + if value is None: + return default + try: + return int(value) + except Exception: + raise RuntimeError( + 'Error: expected the env variable: %s to be set to a int value. Found: %s' % ( + env_key, value)) + + # If true in env, use gevent mode. SUPPORT_GEVENT = is_true_in_env('GEVENT_SUPPORT') @@ -293,6 +305,14 @@ NEXT_VALUE_SEPARATOR = "__pydev_val__" BUILTINS_MODULE_NAME = '__builtin__' if IS_PY2 else 'builtins' SHOW_DEBUG_INFO_ENV = is_true_in_env(('PYCHARM_DEBUG', 'PYDEV_DEBUG', 'PYDEVD_DEBUG')) +# Pandas customization. +PANDAS_MAX_ROWS = as_int_in_env('PYDEVD_PANDAS_MAX_ROWS', 300) +PANDAS_MAX_COLS = as_int_in_env('PYDEVD_PANDAS_MAX_COLS', 300) +PANDAS_MAX_COLWIDTH = as_int_in_env('PYDEVD_PANDAS_MAX_COLWIDTH', 80) + +# If getting an attribute or computing some value is too slow, let the user know if the given timeout elapses. +PYDEVD_WARN_SLOW_RESOLVE_TIMEOUT = as_float_in_env('PYDEVD_WARN_SLOW_RESOLVE_TIMEOUT', 0.15) + # This timeout is used to track the time to send a message saying that the evaluation # is taking too long and possible mitigations. PYDEVD_WARN_EVALUATION_TIMEOUT = as_float_in_env('PYDEVD_WARN_EVALUATION_TIMEOUT', 3.) diff --git a/src/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_dont_trace_files.py b/src/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_dont_trace_files.py index 0efb4b4b..3aab9c50 100644 --- a/src/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_dont_trace_files.py +++ b/src/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_dont_trace_files.py @@ -123,6 +123,7 @@ DONT_TRACE = { 'pydevd_net_command_factory_json.py': PYDEV_FILE, 'pydevd_net_command_factory_xml.py': PYDEV_FILE, 'pydevd_plugin_numpy_types.py': PYDEV_FILE, + 'pydevd_plugin_pandas_types.py': PYDEV_FILE, 'pydevd_plugin_utils.py': PYDEV_FILE, 'pydevd_plugins_django_form_str.py': PYDEV_FILE, 'pydevd_process_net_command.py': PYDEV_FILE, diff --git a/src/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_resolver.py b/src/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_resolver.py index 5fa8c0dd..7c5e96cf 100644 --- a/src/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_resolver.py +++ b/src/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_resolver.py @@ -1,5 +1,5 @@ from _pydev_bundle import pydev_log -from _pydevd_bundle.pydevd_utils import hasattr_checked, DAPGrouper +from _pydevd_bundle.pydevd_utils import hasattr_checked, DAPGrouper, Timer try: import StringIO except: @@ -183,6 +183,8 @@ class DefaultResolver: # optimize the operation by removing as many items as possible in the # first filters, leaving fewer items for later filters + timer = Timer() + cls = type(var) for name in names: try: name_as_str = name @@ -204,6 +206,9 @@ class DefaultResolver: traceback.print_exc(file=strIO) attr = strIO.getvalue() + finally: + timer.report_if_getting_attr_slow(cls, name_as_str) + d[name_as_str] = attr return d, used___dict__ diff --git a/src/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_suspended_frames.py b/src/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_suspended_frames.py index 1f911669..374bae6e 100644 --- a/src/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_suspended_frames.py +++ b/src/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_suspended_frames.py @@ -11,7 +11,7 @@ from _pydev_bundle import pydev_log from _pydevd_bundle import pydevd_vars from _pydev_bundle.pydev_imports import Exec from _pydevd_bundle.pydevd_frame_utils import FramesList -from _pydevd_bundle.pydevd_utils import ScopeRequest, DAPGrouper +from _pydevd_bundle.pydevd_utils import ScopeRequest, DAPGrouper, Timer class _AbstractVariable(object): @@ -40,6 +40,7 @@ class _AbstractVariable(object): :param dict fmt: Format expected by the DAP (keys: 'hex': bool, 'rawString': bool) ''' + timer = Timer() safe_repr = SafeRepr() if fmt is not None: safe_repr.convert_to_hex = fmt.get('hex', False) @@ -89,6 +90,7 @@ class _AbstractVariable(object): if len(attributes) > 0: var_data['presentationHint'] = {'attributes': attributes} + timer.report_if_compute_repr_attr_slow('', name, type_name) return var_data def get_children_variables(self, fmt=None, scope=None): diff --git a/src/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_utils.py b/src/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_utils.py index 6e4af794..00867af0 100644 --- a/src/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_utils.py +++ b/src/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_utils.py @@ -13,10 +13,12 @@ try: except: from urllib.parse import quote # @UnresolvedImport +import time import inspect import sys from _pydevd_bundle.pydevd_constants import IS_PY3K, USE_CUSTOM_SYS_CURRENT_FRAMES, IS_PYPY, SUPPORT_GEVENT, \ - GEVENT_SUPPORT_NOT_SET_MSG, GENERATED_LEN_ATTR_NAME + GEVENT_SUPPORT_NOT_SET_MSG, GENERATED_LEN_ATTR_NAME, PYDEVD_WARN_SLOW_RESOLVE_TIMEOUT, \ + get_global_debugger from _pydev_imps._pydev_saved_modules import threading @@ -454,3 +456,53 @@ def interrupt_main_thread(main_thread): main_thread._thread.interrupt() # Jython except: pydev_log.exception('Error on interrupt main thread fallback.') + + +class Timer(object): + + def __init__(self, min_diff=PYDEVD_WARN_SLOW_RESOLVE_TIMEOUT): + self.min_diff = min_diff + self._curr_time = time.time() + + def print_time(self, msg='Elapsed:'): + old = self._curr_time + new = self._curr_time = time.time() + diff = new - old + if diff >= self.min_diff: + print('%s: %.2fs' % (msg, diff)) + + def _report_slow(self, compute_msg, *args): + old = self._curr_time + new = self._curr_time = time.time() + diff = new - old + if diff >= self.min_diff: + py_db = get_global_debugger() + if py_db is not None: + msg = compute_msg(diff, *args) + py_db.writer.add_command(py_db.cmd_factory.make_warning_message(msg)) + + def report_if_compute_repr_attr_slow(self, attrs_tab_separated, attr_name, attr_type): + self._report_slow(self._compute_repr_slow, attrs_tab_separated, attr_name, attr_type) + + def _compute_repr_slow(self, diff, attrs_tab_separated, attr_name, attr_type): + try: + attr_type = attr_type.__name__ + except: + pass + if attrs_tab_separated: + return 'pydevd warning: Computing repr of %s.%s (%s) was slow (took %.2fs)\n' % ( + attrs_tab_separated.replace('\t', '.'), attr_name, attr_type, diff) + else: + return 'pydevd warning: Computing repr of %s (%s) was slow (took %.2fs)\n' % ( + attr_name, attr_type, diff) + + def report_if_getting_attr_slow(self, cls, attr_name): + self._report_slow(self._compute_get_attr_slow, cls, attr_name) + + def _compute_get_attr_slow(self, diff, cls, attr_name): + try: + cls = cls.__name__ + except: + pass + return 'pydevd warning: Getting attribute %s.%s was slow (took %.2fs)\n' % (cls, attr_name, diff) + diff --git a/src/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_xml.py b/src/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_xml.py index bf7c4fdd..6460d011 100644 --- a/src/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_xml.py +++ b/src/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_xml.py @@ -7,7 +7,7 @@ from _pydevd_bundle.pydevd_constants import dict_iter_items, dict_keys, IS_PY3K, DEFAULT_VALUE from _pydev_bundle.pydev_imports import quote from _pydevd_bundle.pydevd_extension_api import TypeResolveProvider, StrPresentationProvider -from _pydevd_bundle.pydevd_utils import isinstance_checked, hasattr_checked, DAPGrouper +from _pydevd_bundle.pydevd_utils import isinstance_checked, hasattr_checked, DAPGrouper, Timer from _pydevd_bundle.pydevd_resolver import get_var_scope try: @@ -218,7 +218,10 @@ class TypeResolveHandler(object): for provider in self._str_providers: if provider.can_provide(type_object, type_name): self._type_to_str_provider_cache[type_object] = provider - return provider.get_str(o) + try: + return provider.get_str(o) + except: + pydev_log.exception("Error when getting str with custom provider: %s." % (provider,)) self._type_to_str_provider_cache[type_object] = self.NO_PROVIDER return None diff --git a/src/debugpy/_vendored/pydevd/pydevd_plugins/extensions/types/pydevd_plugin_numpy_types.py b/src/debugpy/_vendored/pydevd/pydevd_plugins/extensions/types/pydevd_plugin_numpy_types.py index 2f2dd434..1e3e6e90 100644 --- a/src/debugpy/_vendored/pydevd/pydevd_plugins/extensions/types/pydevd_plugin_numpy_types.py +++ b/src/debugpy/_vendored/pydevd/pydevd_plugins/extensions/types/pydevd_plugin_numpy_types.py @@ -3,25 +3,19 @@ from _pydevd_bundle.pydevd_resolver import defaultResolver, MAX_ITEMS_TO_HANDLE, from .pydevd_helpers import find_mod_attr -# ======================================================================================================================= -# NdArrayResolver -# ======================================================================================================================= -class NdArrayResolver: pass - - -class NdArrayItemsContainer: pass +class NdArrayItemsContainer(object): + pass class NDArrayTypeResolveProvider(object): + ''' + This resolves a numpy ndarray returning some metadata about the NDArray + ''' def can_provide(self, type_object, type_name): nd_array = find_mod_attr('numpy', 'ndarray') return nd_array is not None and issubclass(type_object, nd_array) - ''' - This resolves a numpy ndarray returning some metadata about the NDArray - ''' - def is_numeric(self, obj): if not hasattr(obj, 'dtype'): return False diff --git a/src/debugpy/_vendored/pydevd/pydevd_plugins/extensions/types/pydevd_plugin_pandas_types.py b/src/debugpy/_vendored/pydevd/pydevd_plugins/extensions/types/pydevd_plugin_pandas_types.py new file mode 100644 index 00000000..e6b8c739 --- /dev/null +++ b/src/debugpy/_vendored/pydevd/pydevd_plugins/extensions/types/pydevd_plugin_pandas_types.py @@ -0,0 +1,133 @@ +import sys + +from _pydevd_bundle.pydevd_constants import PANDAS_MAX_ROWS, PANDAS_MAX_COLS, PANDAS_MAX_COLWIDTH +from _pydevd_bundle.pydevd_extension_api import TypeResolveProvider, StrPresentationProvider +from _pydevd_bundle.pydevd_resolver import inspect, MethodWrapperType +from _pydevd_bundle.pydevd_utils import Timer + +from .pydevd_helpers import find_mod_attr + + +def _get_dictionary(obj, replacements): + ret = dict() + cls = obj.__class__ + for attr_name in dir(obj): + + # This is interesting but it actually hides too much info from the dataframe. + # attr_type_in_cls = type(getattr(cls, attr_name, None)) + # if attr_type_in_cls == property: + # ret[attr_name] = '' + # continue + + timer = Timer() + try: + replacement = replacements.get(attr_name) + if replacement is not None: + ret[attr_name] = replacement + continue + + attr_value = getattr(obj, attr_name, '') + if inspect.isroutine(attr_value) or isinstance(attr_value, MethodWrapperType): + continue + ret[attr_name] = attr_value + except Exception as e: + ret[attr_name] = '' % (e,) + finally: + timer.report_if_getting_attr_slow(cls, attr_name) + + return ret + + +class PandasDataFrameTypeResolveProvider(object): + + def can_provide(self, type_object, type_name): + data_frame_class = find_mod_attr('pandas.core.frame', 'DataFrame') + return data_frame_class is not None and issubclass(type_object, data_frame_class) + + def resolve(self, obj, attribute): + return getattr(obj, attribute) + + def get_dictionary(self, obj): + replacements = { + # This actually calls: DataFrame.transpose(), which can be expensive, so, + # let's just add some string representation for it. + 'T': '', + + # This creates a whole new dict{index: Series) for each column. Doing a + # subsequent repr() from this dict can be very slow, so, don't return it. + '_series': '', + + 'style': '', + } + return _get_dictionary(obj, replacements) + + def get_str(self, df): + # The default repr depends on the settings of: + # pandas.set_option('display.max_columns', None) + # pandas.set_option('display.max_rows', None) + # which can make the repr **very** slow on some cases, so, let's use a + # version which + + return df.to_string( + max_rows=PANDAS_MAX_ROWS, + max_cols=PANDAS_MAX_COLS, + max_colwidth=PANDAS_MAX_COLWIDTH, + show_dimensions=True, + ) + + +class PandasSeriesTypeResolveProvider(object): + + def can_provide(self, type_object, type_name): + series_class = find_mod_attr('pandas.core.series', 'Series') + return series_class is not None and issubclass(type_object, series_class) + + def resolve(self, obj, attribute): + return getattr(obj, attribute) + + def get_dictionary(self, obj): + replacements = { + # This actually calls: DataFrame.transpose(), which can be expensive, so, + # let's just add some string representation for it. + 'T': '', + + # This creates a whole new dict{index: Series) for each column. Doing a + # subsequent repr() from this dict can be very slow, so, don't return it. + '_series': '', + + 'style': '', + } + return _get_dictionary(obj, replacements) + + def get_str(self, series): + return (series.to_string( + max_rows=PANDAS_MAX_ROWS, + )) + + +class PandasStylerTypeResolveProvider(object): + + def can_provide(self, type_object, type_name): + series_class = find_mod_attr('pandas.io.formats.style', 'Styler') + return series_class is not None and issubclass(type_object, series_class) + + def resolve(self, obj, attribute): + return getattr(obj, attribute) + + def get_dictionary(self, obj): + replacements = { + 'data': '', + + '__dict__': '', + } + return _get_dictionary(obj, replacements) + + +if not sys.platform.startswith("java"): + TypeResolveProvider.register(PandasDataFrameTypeResolveProvider) + StrPresentationProvider.register(PandasDataFrameTypeResolveProvider) + + TypeResolveProvider.register(PandasSeriesTypeResolveProvider) + StrPresentationProvider.register(PandasSeriesTypeResolveProvider) + + TypeResolveProvider.register(PandasStylerTypeResolveProvider) diff --git a/src/debugpy/_vendored/pydevd/tests_python/test_debugger_json.py b/src/debugpy/_vendored/pydevd/tests_python/test_debugger_json.py index 5d20991b..2aaa20b4 100644 --- a/src/debugpy/_vendored/pydevd/tests_python/test_debugger_json.py +++ b/src/debugpy/_vendored/pydevd/tests_python/test_debugger_json.py @@ -5800,6 +5800,86 @@ def test_function_breakpoints_async(case_setup): writer.finished_ok = True +try: + import pandas +except: + pandas = None + + +@pytest.mark.skipif(pandas is None, reason='Pandas not installed.') +def test_pandas(case_setup, pyfile): + + @pyfile + def pandas_mod(): + import pandas as pd + import numpy as np + + rows = 5000 + cols = 50 + + # i.e.: even with these setting our repr will print at most 300 lines/cols by default. + pd.set_option('display.max_columns', None) + pd.set_option('display.max_rows', None) + + items = rows * cols + df = pd.DataFrame(np.arange(items).reshape(rows, cols)).applymap(lambda x: 'Test String') + series = df._series[0] + styler = df.style + + print('TEST SUCEEDED') # Break here + + with case_setup.test_file(pandas_mod) as writer: + json_facade = JsonFacade(writer) + json_facade.write_launch(justMyCode=False) + + bp = writer.get_line_index_with_content('Break here') + json_facade.write_set_breakpoints([bp]) + + json_facade.write_make_initial_run() + + json_hit = json_facade.wait_for_thread_stopped() + # json_hit = json_facade.get_stack_as_json_hit(json_hit.thread_id) + name_to_var = json_facade.get_locals_name_to_var(json_hit.frame_id) + + # Check the custom repr(DataFrame) + assert name_to_var['df'].value.count('\n') == 303 + assert '...' in name_to_var['df'].value + + # Check the custom repr(Series) + assert name_to_var['series'].value.count('\n') == 300 + assert '...' in name_to_var['series'].value + + # Check custom listing (DataFrame) + df_variables_response = json_facade.get_variables_response(name_to_var['df'].variablesReference) + for v in df_variables_response.body.variables: + if v['name'] == 'T': + assert v['value'] == "''" + break + else: + raise AssertionError('Did not find variable "T".') + + # Check custom listing (Series) + df_variables_response = json_facade.get_variables_response(name_to_var['series'].variablesReference) + for v in df_variables_response.body.variables: + if v['name'] == 'T': + assert v['value'] == "''" + break + else: + raise AssertionError('Did not find variable "T".') + + # Check custom listing (Styler) + df_variables_response = json_facade.get_variables_response(name_to_var['styler'].variablesReference) + for v in df_variables_response.body.variables: + if v['name'] == 'data': + assert v['value'] == "''" + break + else: + raise AssertionError('Did not find variable "data".') + + json_facade.write_continue() + writer.finished_ok = True + + if __name__ == '__main__': pytest.main(['-k', 'test_case_skipping_filters', '-s'])