mirror of
https://github.com/python/cpython.git
synced 2025-09-27 10:50:04 +00:00
bpo-43625: Enhance csv sniffer has_header() to be more accurate (GH-26939)
This commit is contained in:
parent
e3f877c32d
commit
ceea579ccc
4 changed files with 56 additions and 8 deletions
|
@ -269,6 +269,20 @@ The :mod:`csv` module defines the following classes:
|
||||||
|
|
||||||
Analyze the sample text (presumed to be in CSV format) and return
|
Analyze the sample text (presumed to be in CSV format) and return
|
||||||
:const:`True` if the first row appears to be a series of column headers.
|
:const:`True` if the first row appears to be a series of column headers.
|
||||||
|
For each column, one of two key criteria is considered to
|
||||||
|
estimate if the sample contains a header:
|
||||||
|
|
||||||
|
- the second through n-th rows contain numeric values
|
||||||
|
- the second through n-th rows contain strings where at least one value's
|
||||||
|
length differs from that of the putative header of that column.
|
||||||
|
|
||||||
|
Twenty rows after the first row are sampled; if more than half of the columns and
|
||||||
|
rows meet the criteria, :const:`True` is returned.
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
This method is a rough heuristic and may produce both false positives and
|
||||||
|
negatives.
|
||||||
|
|
||||||
An example for :class:`Sniffer` use::
|
An example for :class:`Sniffer` use::
|
||||||
|
|
||||||
|
|
12
Lib/csv.py
12
Lib/csv.py
|
@ -409,14 +409,10 @@ class Sniffer:
|
||||||
continue # skip rows that have irregular number of columns
|
continue # skip rows that have irregular number of columns
|
||||||
|
|
||||||
for col in list(columnTypes.keys()):
|
for col in list(columnTypes.keys()):
|
||||||
|
thisType = complex
|
||||||
for thisType in [int, float, complex]:
|
try:
|
||||||
try:
|
thisType(row[col])
|
||||||
thisType(row[col])
|
except (ValueError, OverflowError):
|
||||||
break
|
|
||||||
except (ValueError, OverflowError):
|
|
||||||
pass
|
|
||||||
else:
|
|
||||||
# fallback to length of string
|
# fallback to length of string
|
||||||
thisType = len(row[col])
|
thisType = len(row[col])
|
||||||
|
|
||||||
|
|
|
@ -1020,6 +1020,42 @@ Stonecutters Seafood and Chop House+ Lemont+ IL+ 12/19/02+ Week Back
|
||||||
'Stonecutters ''Seafood'' and Chop House'+ 'Lemont'+ 'IL'+ '12/19/02'+ 'Week Back'
|
'Stonecutters ''Seafood'' and Chop House'+ 'Lemont'+ 'IL'+ '12/19/02'+ 'Week Back'
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
sample10 = dedent("""
|
||||||
|
abc,def
|
||||||
|
ghijkl,mno
|
||||||
|
ghi,jkl
|
||||||
|
""")
|
||||||
|
|
||||||
|
sample11 = dedent("""
|
||||||
|
abc,def
|
||||||
|
ghijkl,mnop
|
||||||
|
ghi,jkl
|
||||||
|
""")
|
||||||
|
|
||||||
|
sample12 = dedent(""""time","forces"
|
||||||
|
1,1.5
|
||||||
|
0.5,5+0j
|
||||||
|
0,0
|
||||||
|
1+1j,6
|
||||||
|
""")
|
||||||
|
|
||||||
|
sample13 = dedent(""""time","forces"
|
||||||
|
0,0
|
||||||
|
1,2
|
||||||
|
a,b
|
||||||
|
""")
|
||||||
|
|
||||||
|
def test_issue43625(self):
|
||||||
|
sniffer = csv.Sniffer()
|
||||||
|
self.assertTrue(sniffer.has_header(self.sample12))
|
||||||
|
self.assertFalse(sniffer.has_header(self.sample13))
|
||||||
|
|
||||||
|
def test_has_header_strings(self):
|
||||||
|
"More to document existing (unexpected?) behavior than anything else."
|
||||||
|
sniffer = csv.Sniffer()
|
||||||
|
self.assertFalse(sniffer.has_header(self.sample10))
|
||||||
|
self.assertFalse(sniffer.has_header(self.sample11))
|
||||||
|
|
||||||
def test_has_header(self):
|
def test_has_header(self):
|
||||||
sniffer = csv.Sniffer()
|
sniffer = csv.Sniffer()
|
||||||
self.assertIs(sniffer.has_header(self.sample1), False)
|
self.assertIs(sniffer.has_header(self.sample1), False)
|
||||||
|
|
|
@ -0,0 +1,2 @@
|
||||||
|
Fix a bug in the detection of CSV file headers by
|
||||||
|
:meth:`csv.Sniffer.has_header` and improve its documentation.
|
Loading…
Add table
Add a link
Reference in a new issue