* Correct the Sniffer documentation to match the implementation.

* Add an optional delimiters argument to Sniffer.sniff() that restricts the set
  of candidate field delimiters.
This commit is contained in:
Skip Montanaro 2003-05-19 15:33:36 +00:00
parent c626658a28
commit 7789237331
3 changed files with 31 additions and 14 deletions

View file

@ -152,17 +152,17 @@ attributes, which are used to define the parameters for a specific
\class{reader} or \class{writer} instance. \class{reader} or \class{writer} instance.
\end{classdesc*} \end{classdesc*}
\begin{classdesc}{Sniffer}{\optional{sample=16384}} \begin{classdesc}{Sniffer}{}
The \class{Sniffer} class is used to deduce the format of a CSV file. The The \class{Sniffer} class is used to deduce the format of a CSV file.
optional \var{sample} argument to the constructor specifies the number of
bytes to use when determining Dialect parameters.
\end{classdesc} \end{classdesc}
The \class{Sniffer} class provides a single method: The \class{Sniffer} class provides a single method:
\begin{methoddesc}{sniff}{fileobj} \begin{methoddesc}{sniff}{sample\optional{,delimiters=None}}
Analyze the next chunk of \var{fileobj} and return a \class{Dialect} subclass Analyze the given \var{sample} and return a \class{Dialect} subclass
reflecting the parameters found. reflecting the parameters found. If the optional \var{delimiters} parameter
is given, it is interpreted as a string containing possible valid delimiter
characters.
\end{methoddesc} \end{methoddesc}
\begin{methoddesc}{has_header}{sample} \begin{methoddesc}{has_header}{sample}

View file

@ -159,15 +159,16 @@ class Sniffer:
self.preferred = [',', '\t', ';', ' ', ':'] self.preferred = [',', '\t', ';', ' ', ':']
def sniff(self, sample): def sniff(self, sample, delimiters=None):
""" """
Returns a dialect (or None) corresponding to the sample Returns a dialect (or None) corresponding to the sample
""" """
quotechar, delimiter, skipinitialspace = \ quotechar, delimiter, skipinitialspace = \
self._guess_quote_and_delimiter(sample) self._guess_quote_and_delimiter(sample, delimiters)
if delimiter is None: if delimiter is None:
delimiter, skipinitialspace = self._guess_delimiter(sample) delimiter, skipinitialspace = self._guess_delimiter(sample,
delimiters)
class dialect(Dialect): class dialect(Dialect):
_name = "sniffed" _name = "sniffed"
@ -184,7 +185,7 @@ class Sniffer:
return dialect return dialect
def _guess_quote_and_delimiter(self, data): def _guess_quote_and_delimiter(self, data, delimiters):
""" """
Looks for text enclosed between two identical quotes Looks for text enclosed between two identical quotes
(the probable quotechar) which are preceded and followed (the probable quotechar) which are preceded and followed
@ -222,7 +223,7 @@ class Sniffer:
key = m[n] key = m[n]
except KeyError: except KeyError:
continue continue
if key: if key and (delimiters is None or key in delimiters):
delims[key] = delims.get(key, 0) + 1 delims[key] = delims.get(key, 0) + 1
try: try:
n = regexp.groupindex['space'] - 1 n = regexp.groupindex['space'] - 1
@ -248,7 +249,7 @@ class Sniffer:
return (quotechar, delim, skipinitialspace) return (quotechar, delim, skipinitialspace)
def _guess_delimiter(self, data): def _guess_delimiter(self, data, delimiters):
""" """
The delimiter /should/ occur the same number of times on The delimiter /should/ occur the same number of times on
each row. However, due to malformed data, it may not. We don't want each row. However, due to malformed data, it may not. We don't want
@ -316,7 +317,8 @@ class Sniffer:
while len(delims) == 0 and consistency >= threshold: while len(delims) == 0 and consistency >= threshold:
for k, v in modeList: for k, v in modeList:
if v[0] > 0 and v[1] > 0: if v[0] > 0 and v[1] > 0:
if (v[1]/total) >= consistency: if ((v[1]/total) >= consistency and
(delimiters is None or k in delimiters)):
delims[k] = v delims[k] = v
consistency -= 0.01 consistency -= 0.01

View file

@ -551,6 +551,12 @@ Stonecutters Seafood and Chop House, Lemont, IL, 12/19/02, Week Back
header = '''\ header = '''\
"venue","city","state","date","performers" "venue","city","state","date","performers"
''' '''
sample3 = '''\
05/05/03?05/05/03?05/05/03?05/05/03?05/05/03?05/05/03
05/05/03?05/05/03?05/05/03?05/05/03?05/05/03?05/05/03
05/05/03?05/05/03?05/05/03?05/05/03?05/05/03?05/05/03
'''
def test_has_header(self): def test_has_header(self):
sniffer = csv.Sniffer() sniffer = csv.Sniffer()
self.assertEqual(sniffer.has_header(self.sample1), False) self.assertEqual(sniffer.has_header(self.sample1), False)
@ -568,6 +574,15 @@ Stonecutters Seafood and Chop House, Lemont, IL, 12/19/02, Week Back
self.assertEqual(dialect.quotechar, "'") self.assertEqual(dialect.quotechar, "'")
self.assertEqual(dialect.skipinitialspace, False) self.assertEqual(dialect.skipinitialspace, False)
def test_delimiters(self):
sniffer = csv.Sniffer()
dialect = sniffer.sniff(self.sample3)
self.assertEqual(dialect.delimiter, "0")
dialect = sniffer.sniff(self.sample3, delimiters="?,")
self.assertEqual(dialect.delimiter, "?")
dialect = sniffer.sniff(self.sample3, delimiters="/,")
self.assertEqual(dialect.delimiter, "/")
if not hasattr(sys, "gettotalrefcount"): if not hasattr(sys, "gettotalrefcount"):
if test_support.verbose: print "*** skipping leakage tests ***" if test_support.verbose: print "*** skipping leakage tests ***"
else: else: