mirror of
https://github.com/python/cpython.git
synced 2025-10-02 21:25:24 +00:00
* Correct Sniffer doc to correspond to the implementation.
* Add an optional delimiters argument to Sniffer.sniff() that restricts the set of candidate field delimiters.
This commit is contained in:
parent
c626658a28
commit
7789237331
3 changed files with 31 additions and 14 deletions
|
@ -152,17 +152,17 @@ attributes, which are used to define the parameters for a specific
|
||||||
\class{reader} or \class{writer} instance.
|
\class{reader} or \class{writer} instance.
|
||||||
\end{classdesc*}
|
\end{classdesc*}
|
||||||
|
|
||||||
\begin{classdesc}{Sniffer}{\optional{sample=16384}}
|
\begin{classdesc}{Sniffer}{}
|
||||||
The \class{Sniffer} class is used to deduce the format of a CSV file. The
|
The \class{Sniffer} class is used to deduce the format of a CSV file.
|
||||||
optional \var{sample} argument to the constructor specifies the number of
|
|
||||||
bytes to use when determining Dialect parameters.
|
|
||||||
\end{classdesc}
|
\end{classdesc}
|
||||||
|
|
||||||
The \class{Sniffer} class provides a single method:
|
The \class{Sniffer} class provides a single method:
|
||||||
|
|
||||||
\begin{methoddesc}{sniff}{fileobj}
|
\begin{methoddesc}{sniff}{sample\optional{,delimiters=None}}
|
||||||
Analyze the next chunk of \var{fileobj} and return a \class{Dialect} subclass
|
Analyze the given \var{sample} and return a \class{Dialect} subclass
|
||||||
reflecting the parameters found.
|
reflecting the parameters found. If the optional \var{delimiters} parameter
|
||||||
|
is given, it is interpreted as a string containing possible valid delimiter
|
||||||
|
characters.
|
||||||
\end{methoddesc}
|
\end{methoddesc}
|
||||||
|
|
||||||
\begin{methoddesc}{has_header}{sample}
|
\begin{methoddesc}{has_header}{sample}
|
||||||
|
|
16
Lib/csv.py
16
Lib/csv.py
|
@ -159,15 +159,16 @@ class Sniffer:
|
||||||
self.preferred = [',', '\t', ';', ' ', ':']
|
self.preferred = [',', '\t', ';', ' ', ':']
|
||||||
|
|
||||||
|
|
||||||
def sniff(self, sample):
|
def sniff(self, sample, delimiters=None):
|
||||||
"""
|
"""
|
||||||
Returns a dialect (or None) corresponding to the sample
|
Returns a dialect (or None) corresponding to the sample
|
||||||
"""
|
"""
|
||||||
|
|
||||||
quotechar, delimiter, skipinitialspace = \
|
quotechar, delimiter, skipinitialspace = \
|
||||||
self._guess_quote_and_delimiter(sample)
|
self._guess_quote_and_delimiter(sample, delimiters)
|
||||||
if delimiter is None:
|
if delimiter is None:
|
||||||
delimiter, skipinitialspace = self._guess_delimiter(sample)
|
delimiter, skipinitialspace = self._guess_delimiter(sample,
|
||||||
|
delimiters)
|
||||||
|
|
||||||
class dialect(Dialect):
|
class dialect(Dialect):
|
||||||
_name = "sniffed"
|
_name = "sniffed"
|
||||||
|
@ -184,7 +185,7 @@ class Sniffer:
|
||||||
return dialect
|
return dialect
|
||||||
|
|
||||||
|
|
||||||
def _guess_quote_and_delimiter(self, data):
|
def _guess_quote_and_delimiter(self, data, delimiters):
|
||||||
"""
|
"""
|
||||||
Looks for text enclosed between two identical quotes
|
Looks for text enclosed between two identical quotes
|
||||||
(the probable quotechar) which are preceded and followed
|
(the probable quotechar) which are preceded and followed
|
||||||
|
@ -222,7 +223,7 @@ class Sniffer:
|
||||||
key = m[n]
|
key = m[n]
|
||||||
except KeyError:
|
except KeyError:
|
||||||
continue
|
continue
|
||||||
if key:
|
if key and (delimiters is None or key in delimiters):
|
||||||
delims[key] = delims.get(key, 0) + 1
|
delims[key] = delims.get(key, 0) + 1
|
||||||
try:
|
try:
|
||||||
n = regexp.groupindex['space'] - 1
|
n = regexp.groupindex['space'] - 1
|
||||||
|
@ -248,7 +249,7 @@ class Sniffer:
|
||||||
return (quotechar, delim, skipinitialspace)
|
return (quotechar, delim, skipinitialspace)
|
||||||
|
|
||||||
|
|
||||||
def _guess_delimiter(self, data):
|
def _guess_delimiter(self, data, delimiters):
|
||||||
"""
|
"""
|
||||||
The delimiter /should/ occur the same number of times on
|
The delimiter /should/ occur the same number of times on
|
||||||
each row. However, due to malformed data, it may not. We don't want
|
each row. However, due to malformed data, it may not. We don't want
|
||||||
|
@ -316,7 +317,8 @@ class Sniffer:
|
||||||
while len(delims) == 0 and consistency >= threshold:
|
while len(delims) == 0 and consistency >= threshold:
|
||||||
for k, v in modeList:
|
for k, v in modeList:
|
||||||
if v[0] > 0 and v[1] > 0:
|
if v[0] > 0 and v[1] > 0:
|
||||||
if (v[1]/total) >= consistency:
|
if ((v[1]/total) >= consistency and
|
||||||
|
(delimiters is None or k in delimiters)):
|
||||||
delims[k] = v
|
delims[k] = v
|
||||||
consistency -= 0.01
|
consistency -= 0.01
|
||||||
|
|
||||||
|
|
|
@ -551,6 +551,12 @@ Stonecutters Seafood and Chop House, Lemont, IL, 12/19/02, Week Back
|
||||||
header = '''\
|
header = '''\
|
||||||
"venue","city","state","date","performers"
|
"venue","city","state","date","performers"
|
||||||
'''
|
'''
|
||||||
|
sample3 = '''\
|
||||||
|
05/05/03?05/05/03?05/05/03?05/05/03?05/05/03?05/05/03
|
||||||
|
05/05/03?05/05/03?05/05/03?05/05/03?05/05/03?05/05/03
|
||||||
|
05/05/03?05/05/03?05/05/03?05/05/03?05/05/03?05/05/03
|
||||||
|
'''
|
||||||
|
|
||||||
def test_has_header(self):
|
def test_has_header(self):
|
||||||
sniffer = csv.Sniffer()
|
sniffer = csv.Sniffer()
|
||||||
self.assertEqual(sniffer.has_header(self.sample1), False)
|
self.assertEqual(sniffer.has_header(self.sample1), False)
|
||||||
|
@ -568,6 +574,15 @@ Stonecutters Seafood and Chop House, Lemont, IL, 12/19/02, Week Back
|
||||||
self.assertEqual(dialect.quotechar, "'")
|
self.assertEqual(dialect.quotechar, "'")
|
||||||
self.assertEqual(dialect.skipinitialspace, False)
|
self.assertEqual(dialect.skipinitialspace, False)
|
||||||
|
|
||||||
|
def test_delimiters(self):
|
||||||
|
sniffer = csv.Sniffer()
|
||||||
|
dialect = sniffer.sniff(self.sample3)
|
||||||
|
self.assertEqual(dialect.delimiter, "0")
|
||||||
|
dialect = sniffer.sniff(self.sample3, delimiters="?,")
|
||||||
|
self.assertEqual(dialect.delimiter, "?")
|
||||||
|
dialect = sniffer.sniff(self.sample3, delimiters="/,")
|
||||||
|
self.assertEqual(dialect.delimiter, "/")
|
||||||
|
|
||||||
if not hasattr(sys, "gettotalrefcount"):
|
if not hasattr(sys, "gettotalrefcount"):
|
||||||
if test_support.verbose: print "*** skipping leakage tests ***"
|
if test_support.verbose: print "*** skipping leakage tests ***"
|
||||||
else:
|
else:
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue