Customize pandas representation so that it doesn't freeze anymore. Fixes #695

2025-12-23 08:48:12 +00:00 · 2021-11-18 14:44:14 -03:00 · 2021-11-18 14:44:14 -03:00 · 2a8758d18f
commit 2a8758d18f
parent 30c150af91
10 changed files with 310 additions and 17 deletions
--- a/src/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_comm.py
+++ b/src/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_comm.py
@ -108,7 +108,7 @@ from _pydevd_bundle import pydevd_vm_type
 import sys
 import traceback
 from _pydevd_bundle.pydevd_utils import quote_smart as quote, compare_object_attrs_key, \
-    notify_about_gevent_if_needed, isinstance_checked, ScopeRequest, getattr_checked
+    notify_about_gevent_if_needed, isinstance_checked, ScopeRequest, getattr_checked, Timer
 from _pydev_bundle import pydev_log, fsnotify
 from _pydev_bundle.pydev_log import exception as pydev_log_exception
 from _pydev_bundle import _pydev_completer
@ -821,10 +821,12 @@ class InternalGetVariable(InternalThreadCommand):
            if not (_typeName == "OrderedDict" or val_dict.__class__.__name__ == "OrderedDict" or IS_PY36_OR_GREATER):
                keys.sort(key=compare_object_attrs_key)

+            timer = Timer()
            for k in keys:
                val = val_dict[k]
                evaluate_full_value = pydevd_xml.should_evaluate_full_value(val)
                xml.write(pydevd_xml.var_to_xml(val, k, evaluate_full_value=evaluate_full_value))
+                timer.report_if_compute_repr_attr_slow(self.attributes, k, type(val))

            xml.write("</xml>")
            cmd = dbg.cmd_factory.make_get_variable_message(self.sequence, xml.getvalue())
@ -1256,6 +1258,7 @@ def internal_evaluate_expression_json(py_db, request, thread_id):
    variables_response = pydevd_base_schema.build_response(request, kwargs={'body':body})
    py_db.writer.add_command(NetCommand(CMD_RETURN, 0, variables_response, is_json=True))

+
 def _evaluate_response_return_exception(py_db, request, exc_type, exc, initial_tb):
    try:
        tb = initial_tb
--- a/src/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_constants.py
+++ b/src/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_constants.py
@ -249,6 +249,18 @@ def as_float_in_env(env_key, default):
                env_key, value))


+def as_int_in_env(env_key, default):
+    value = os.getenv(env_key)
+    if value is None:
+        return default
+    try:
+        return int(value)
+    except Exception:
+        raise RuntimeError(
+            'Error: expected the env variable: %s to be set to a int value. Found: %s' % (
+                env_key, value))
+
+
 # If true in env, use gevent mode.
 SUPPORT_GEVENT = is_true_in_env('GEVENT_SUPPORT')

@ -293,6 +305,14 @@ NEXT_VALUE_SEPARATOR = "__pydev_val__"
 BUILTINS_MODULE_NAME = '__builtin__' if IS_PY2 else 'builtins'
 SHOW_DEBUG_INFO_ENV = is_true_in_env(('PYCHARM_DEBUG', 'PYDEV_DEBUG', 'PYDEVD_DEBUG'))

+# Pandas customization.
+PANDAS_MAX_ROWS = as_int_in_env('PYDEVD_PANDAS_MAX_ROWS', 300)
+PANDAS_MAX_COLS = as_int_in_env('PYDEVD_PANDAS_MAX_COLS', 300)
+PANDAS_MAX_COLWIDTH = as_int_in_env('PYDEVD_PANDAS_MAX_COLWIDTH', 80)
+
+# If getting an attribute or computing some value is too slow, let the user know if the given timeout elapses.
+PYDEVD_WARN_SLOW_RESOLVE_TIMEOUT = as_float_in_env('PYDEVD_WARN_SLOW_RESOLVE_TIMEOUT', 0.15)
+
 # This timeout is used to track the time to send a message saying that the evaluation
 # is taking too long and possible mitigations.
 PYDEVD_WARN_EVALUATION_TIMEOUT = as_float_in_env('PYDEVD_WARN_EVALUATION_TIMEOUT', 3.)
--- a/src/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_dont_trace_files.py
+++ b/src/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_dont_trace_files.py
@ -123,6 +123,7 @@ DONT_TRACE = {
    'pydevd_net_command_factory_json.py': PYDEV_FILE,
    'pydevd_net_command_factory_xml.py': PYDEV_FILE,
    'pydevd_plugin_numpy_types.py': PYDEV_FILE,
+    'pydevd_plugin_pandas_types.py': PYDEV_FILE,
    'pydevd_plugin_utils.py': PYDEV_FILE,
    'pydevd_plugins_django_form_str.py': PYDEV_FILE,
    'pydevd_process_net_command.py': PYDEV_FILE,
--- a/src/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_resolver.py
+++ b/src/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_resolver.py
@ -1,5 +1,5 @@
 from _pydev_bundle import pydev_log
-from _pydevd_bundle.pydevd_utils import hasattr_checked, DAPGrouper
+from _pydevd_bundle.pydevd_utils import hasattr_checked, DAPGrouper, Timer
 try:
    import StringIO
 except:
@ -183,6 +183,8 @@ class DefaultResolver:
        # optimize the operation by removing as many items as possible in the
        # first filters, leaving fewer items for later filters

+        timer = Timer()
+        cls = type(var)
        for name in names:
            try:
                name_as_str = name
@ -204,6 +206,9 @@ class DefaultResolver:
                traceback.print_exc(file=strIO)
                attr = strIO.getvalue()

+            finally:
+                timer.report_if_getting_attr_slow(cls, name_as_str)
+
            d[name_as_str] = attr

        return d, used___dict__
--- a/src/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_suspended_frames.py
+++ b/src/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_suspended_frames.py
@ -11,7 +11,7 @@ from _pydev_bundle import pydev_log
 from _pydevd_bundle import pydevd_vars
 from _pydev_bundle.pydev_imports import Exec
 from _pydevd_bundle.pydevd_frame_utils import FramesList
-from _pydevd_bundle.pydevd_utils import ScopeRequest, DAPGrouper
+from _pydevd_bundle.pydevd_utils import ScopeRequest, DAPGrouper, Timer


 class _AbstractVariable(object):
@ -40,6 +40,7 @@ class _AbstractVariable(object):
        :param dict fmt:
            Format expected by the DAP (keys: 'hex': bool, 'rawString': bool)
        '''
+        timer = Timer()
        safe_repr = SafeRepr()
        if fmt is not None:
            safe_repr.convert_to_hex = fmt.get('hex', False)
@ -89,6 +90,7 @@ class _AbstractVariable(object):
        if len(attributes) > 0:
            var_data['presentationHint'] = {'attributes': attributes}

+        timer.report_if_compute_repr_attr_slow('', name, type_name)
        return var_data

    def get_children_variables(self, fmt=None, scope=None):
--- a/src/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_utils.py
+++ b/src/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_utils.py
@ -13,10 +13,12 @@ try:
 except:
    from urllib.parse import quote  # @UnresolvedImport

+import time
 import inspect
 import sys
 from _pydevd_bundle.pydevd_constants import IS_PY3K, USE_CUSTOM_SYS_CURRENT_FRAMES, IS_PYPY, SUPPORT_GEVENT, \
-    GEVENT_SUPPORT_NOT_SET_MSG, GENERATED_LEN_ATTR_NAME
+    GEVENT_SUPPORT_NOT_SET_MSG, GENERATED_LEN_ATTR_NAME, PYDEVD_WARN_SLOW_RESOLVE_TIMEOUT, \
+    get_global_debugger
 from _pydev_imps._pydev_saved_modules import threading


@ -454,3 +456,53 @@ def interrupt_main_thread(main_thread):
                main_thread._thread.interrupt()  # Jython
        except:
            pydev_log.exception('Error on interrupt main thread fallback.')
+
+
+class Timer(object):
+
+    def __init__(self, min_diff=PYDEVD_WARN_SLOW_RESOLVE_TIMEOUT):
+        self.min_diff = min_diff
+        self._curr_time = time.time()
+
+    def print_time(self, msg='Elapsed:'):
+        old = self._curr_time
+        new = self._curr_time = time.time()
+        diff = new - old
+        if diff >= self.min_diff:
+            print('%s: %.2fs' % (msg, diff))
+
+    def _report_slow(self, compute_msg, *args):
+        old = self._curr_time
+        new = self._curr_time = time.time()
+        diff = new - old
+        if diff >= self.min_diff:
+            py_db = get_global_debugger()
+            if py_db is not None:
+                msg = compute_msg(diff, *args)
+                py_db.writer.add_command(py_db.cmd_factory.make_warning_message(msg))
+
+    def report_if_compute_repr_attr_slow(self, attrs_tab_separated, attr_name, attr_type):
+        self._report_slow(self._compute_repr_slow, attrs_tab_separated, attr_name, attr_type)
+
+    def _compute_repr_slow(self, diff, attrs_tab_separated, attr_name, attr_type):
+        try:
+            attr_type = attr_type.__name__
+        except:
+            pass
+        if attrs_tab_separated:
+            return 'pydevd warning: Computing repr of %s.%s (%s) was slow (took %.2fs)\n' % (
+                attrs_tab_separated.replace('\t', '.'), attr_name, attr_type, diff)
+        else:
+            return 'pydevd warning: Computing repr of %s (%s) was slow (took %.2fs)\n' % (
+                attr_name, attr_type, diff)
+
+    def report_if_getting_attr_slow(self, cls, attr_name):
+        self._report_slow(self._compute_get_attr_slow, cls, attr_name)
+
+    def _compute_get_attr_slow(self, diff, cls, attr_name):
+        try:
+            cls = cls.__name__
+        except:
+            pass
+        return 'pydevd warning: Getting attribute %s.%s was slow (took %.2fs)\n' % (cls, attr_name, diff)
+
--- a/src/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_xml.py
+++ b/src/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_xml.py
@ -7,7 +7,7 @@ from _pydevd_bundle.pydevd_constants import dict_iter_items, dict_keys, IS_PY3K,
    DEFAULT_VALUE
 from _pydev_bundle.pydev_imports import quote
 from _pydevd_bundle.pydevd_extension_api import TypeResolveProvider, StrPresentationProvider
-from _pydevd_bundle.pydevd_utils import isinstance_checked, hasattr_checked, DAPGrouper
+from _pydevd_bundle.pydevd_utils import isinstance_checked, hasattr_checked, DAPGrouper, Timer
 from _pydevd_bundle.pydevd_resolver import get_var_scope

 try:
@ -218,7 +218,10 @@ class TypeResolveHandler(object):
        for provider in self._str_providers:
            if provider.can_provide(type_object, type_name):
                self._type_to_str_provider_cache[type_object] = provider
-                return provider.get_str(o)
+                try:
+                    return provider.get_str(o)
+                except:
+                    pydev_log.exception("Error when getting str with custom provider: %s." % (provider,))

        self._type_to_str_provider_cache[type_object] = self.NO_PROVIDER
        return None
--- a/src/debugpy/_vendored/pydevd/pydevd_plugins/extensions/types/pydevd_plugin_numpy_types.py
+++ b/src/debugpy/_vendored/pydevd/pydevd_plugins/extensions/types/pydevd_plugin_numpy_types.py
@ -3,25 +3,19 @@ from _pydevd_bundle.pydevd_resolver import defaultResolver, MAX_ITEMS_TO_HANDLE,
 from .pydevd_helpers import find_mod_attr


-# =======================================================================================================================
-# NdArrayResolver
-# =======================================================================================================================
-class NdArrayResolver: pass
-
-
-class NdArrayItemsContainer: pass
+class NdArrayItemsContainer(object):
+    pass


 class NDArrayTypeResolveProvider(object):
+    '''
+    This resolves a numpy ndarray returning some metadata about the NDArray
+    '''

    def can_provide(self, type_object, type_name):
        nd_array = find_mod_attr('numpy', 'ndarray')
        return nd_array is not None and issubclass(type_object, nd_array)

-    '''
-       This resolves a numpy ndarray returning some metadata about the NDArray
-   '''
-
    def is_numeric(self, obj):
        if not hasattr(obj, 'dtype'):
            return False
--- a/src/debugpy/_vendored/pydevd/pydevd_plugins/extensions/types/pydevd_plugin_pandas_types.py
+++ b/src/debugpy/_vendored/pydevd/pydevd_plugins/extensions/types/pydevd_plugin_pandas_types.py
@ -0,0 +1,133 @@
+import sys
+
+from _pydevd_bundle.pydevd_constants import PANDAS_MAX_ROWS, PANDAS_MAX_COLS, PANDAS_MAX_COLWIDTH
+from _pydevd_bundle.pydevd_extension_api import TypeResolveProvider, StrPresentationProvider
+from _pydevd_bundle.pydevd_resolver import inspect, MethodWrapperType
+from _pydevd_bundle.pydevd_utils import Timer
+
+from .pydevd_helpers import find_mod_attr
+
+
+def _get_dictionary(obj, replacements):
+    ret = dict()
+    cls = obj.__class__
+    for attr_name in dir(obj):
+
+        # This is interesting but it actually hides too much info from the dataframe.
+        # attr_type_in_cls = type(getattr(cls, attr_name, None))
+        # if attr_type_in_cls == property:
+        #     ret[attr_name] = '<property (not computed)>'
+        #     continue
+
+        timer = Timer()
+        try:
+            replacement = replacements.get(attr_name)
+            if replacement is not None:
+                ret[attr_name] = replacement
+                continue
+
+            attr_value = getattr(obj, attr_name, '<unable to get>')
+            if inspect.isroutine(attr_value) or isinstance(attr_value, MethodWrapperType):
+                continue
+            ret[attr_name] = attr_value
+        except Exception as e:
+            ret[attr_name] = '<error getting: %s>' % (e,)
+        finally:
+            timer.report_if_getting_attr_slow(cls, attr_name)
+
+    return ret
+
+
+class PandasDataFrameTypeResolveProvider(object):
+
+    def can_provide(self, type_object, type_name):
+        data_frame_class = find_mod_attr('pandas.core.frame', 'DataFrame')
+        return data_frame_class is not None and issubclass(type_object, data_frame_class)
+
+    def resolve(self, obj, attribute):
+        return getattr(obj, attribute)
+
+    def get_dictionary(self, obj):
+        replacements = {
+            # This actually calls: DataFrame.transpose(), which can be expensive, so,
+            # let's just add some string representation for it.
+            'T': '<transposed dataframe -- debugger:skipped eval>',
+
+            # This creates a whole new dict{index: Series} for each column. Doing a
+            # subsequent repr() from this dict can be very slow, so, don't return it.
+            '_series': '<dict[index:Series] -- debugger:skipped eval>',
+
+            'style': '<pandas.io.formats.style.Styler -- debugger: skipped eval>',
+        }
+        return _get_dictionary(obj, replacements)
+
+    def get_str(self, df):
+        # The default repr depends on the settings of:
+        # pandas.set_option('display.max_columns', None)
+        # pandas.set_option('display.max_rows', None)
+        # which can make the repr **very** slow in some cases, so, let's use a
+        # version which caps the rows, columns and column width printed.
+
+        return df.to_string(
+            max_rows=PANDAS_MAX_ROWS,
+            max_cols=PANDAS_MAX_COLS,
+            max_colwidth=PANDAS_MAX_COLWIDTH,
+            show_dimensions=True,
+        )
+
+
+class PandasSeriesTypeResolveProvider(object):
+
+    def can_provide(self, type_object, type_name):
+        series_class = find_mod_attr('pandas.core.series', 'Series')
+        return series_class is not None and issubclass(type_object, series_class)
+
+    def resolve(self, obj, attribute):
+        return getattr(obj, attribute)
+
+    def get_dictionary(self, obj):
+        replacements = {
+            # This actually calls: DataFrame.transpose(), which can be expensive, so,
+            # let's just add some string representation for it.
+            'T': '<transposed dataframe -- debugger:skipped eval>',
+
+            # This creates a whole new dict{index: Series} for each column. Doing a
+            # subsequent repr() from this dict can be very slow, so, don't return it.
+            '_series': '<dict[index:Series] -- debugger:skipped eval>',
+
+            'style': '<pandas.io.formats.style.Styler -- debugger: skipped eval>',
+        }
+        return _get_dictionary(obj, replacements)
+
+    def get_str(self, series):
+        return (series.to_string(
+            max_rows=PANDAS_MAX_ROWS,
+        ))
+
+
+class PandasStylerTypeResolveProvider(object):
+
+    def can_provide(self, type_object, type_name):
+        series_class = find_mod_attr('pandas.io.formats.style', 'Styler')
+        return series_class is not None and issubclass(type_object, series_class)
+
+    def resolve(self, obj, attribute):
+        return getattr(obj, attribute)
+
+    def get_dictionary(self, obj):
+        replacements = {
+            'data': '<Styler data -- debugger:skipped eval>',
+
+            '__dict__': '<dict -- debugger: skipped eval>',
+        }
+        return _get_dictionary(obj, replacements)
+
+
+if not sys.platform.startswith("java"):
+    TypeResolveProvider.register(PandasDataFrameTypeResolveProvider)
+    StrPresentationProvider.register(PandasDataFrameTypeResolveProvider)
+
+    TypeResolveProvider.register(PandasSeriesTypeResolveProvider)
+    StrPresentationProvider.register(PandasSeriesTypeResolveProvider)
+
+    TypeResolveProvider.register(PandasStylerTypeResolveProvider)
--- a/src/debugpy/_vendored/pydevd/tests_python/test_debugger_json.py
+++ b/src/debugpy/_vendored/pydevd/tests_python/test_debugger_json.py
@ -5800,6 +5800,86 @@ def test_function_breakpoints_async(case_setup):
        writer.finished_ok = True


+try:
+    import pandas
+except:
+    pandas = None
+
+
+@pytest.mark.skipif(pandas is None, reason='Pandas not installed.')
+def test_pandas(case_setup, pyfile):
+
+    @pyfile
+    def pandas_mod():
+        import pandas as pd
+        import numpy as np
+
+        rows = 5000
+        cols = 50
+
+        # i.e.: even with these settings our repr will print at most 300 lines/cols by default.
+        pd.set_option('display.max_columns', None)
+        pd.set_option('display.max_rows', None)
+
+        items = rows * cols
+        df = pd.DataFrame(np.arange(items).reshape(rows, cols)).applymap(lambda x: 'Test String')
+        series = df._series[0]
+        styler = df.style
+
+        print('TEST SUCEEDED')  # Break here
+
+    with case_setup.test_file(pandas_mod) as writer:
+        json_facade = JsonFacade(writer)
+        json_facade.write_launch(justMyCode=False)
+
+        bp = writer.get_line_index_with_content('Break here')
+        json_facade.write_set_breakpoints([bp])
+
+        json_facade.write_make_initial_run()
+
+        json_hit = json_facade.wait_for_thread_stopped()
+        # json_hit = json_facade.get_stack_as_json_hit(json_hit.thread_id)
+        name_to_var = json_facade.get_locals_name_to_var(json_hit.frame_id)
+
+        # Check the custom repr(DataFrame)
+        assert name_to_var['df'].value.count('\n') == 303
+        assert '...' in name_to_var['df'].value
+
+        # Check the custom repr(Series)
+        assert name_to_var['series'].value.count('\n') == 300
+        assert '...' in name_to_var['series'].value
+
+        # Check custom listing (DataFrame)
+        df_variables_response = json_facade.get_variables_response(name_to_var['df'].variablesReference)
+        for v in df_variables_response.body.variables:
+            if v['name'] == 'T':
+                assert v['value'] == "'<transposed dataframe -- debugger:skipped eval>'"
+                break
+        else:
+            raise AssertionError('Did not find variable "T".')
+
+        # Check custom listing (Series)
+        df_variables_response = json_facade.get_variables_response(name_to_var['series'].variablesReference)
+        for v in df_variables_response.body.variables:
+            if v['name'] == 'T':
+                assert v['value'] == "'<transposed dataframe -- debugger:skipped eval>'"
+                break
+        else:
+            raise AssertionError('Did not find variable "T".')
+
+        # Check custom listing (Styler)
+        df_variables_response = json_facade.get_variables_response(name_to_var['styler'].variablesReference)
+        for v in df_variables_response.body.variables:
+            if v['name'] == 'data':
+                assert v['value'] == "'<Styler data -- debugger:skipped eval>'"
+                break
+        else:
+            raise AssertionError('Did not find variable "data".')
+
+        json_facade.write_continue()
+        writer.finished_ok = True
+
+
 if __name__ == '__main__':
    pytest.main(['-k', 'test_case_skipping_filters', '-s'])