mirror of https://github.com/python/cpython.git
bpo-43625: Enhance csv sniffer has_header() to be more accurate (GH-26939)
This commit is contained in:
parent
e3f877c32d
commit
ceea579ccc
|
@ -269,6 +269,20 @@ The :mod:`csv` module defines the following classes:
|
|||
|
||||
Analyze the sample text (presumed to be in CSV format) and return
|
||||
:const:`True` if the first row appears to be a series of column headers.
|
||||
Inspecting each column, one of two key criteria will be considered to
|
||||
estimate if the sample contains a header:
|
||||
|
||||
- the second through n-th rows contain numeric values
|
||||
- the second through n-th rows contain strings where at least one value's
|
||||
length differs from that of the putative header of that column.
|
||||
|
||||
Twenty rows after the first row are sampled; if more than half of the columns and
|
||||
rows meet the criteria, :const:`True` is returned.
|
||||
|
||||
.. note::
|
||||
|
||||
This method is a rough heuristic and may produce both false positives and
|
||||
negatives.
|
||||
|
||||
An example for :class:`Sniffer` use::
|
||||
|
||||
|
|
12
Lib/csv.py
12
Lib/csv.py
|
@ -409,14 +409,10 @@ def has_header(self, sample):
|
|||
continue # skip rows that have irregular number of columns
|
||||
|
||||
for col in list(columnTypes.keys()):
|
||||
|
||||
for thisType in [int, float, complex]:
|
||||
try:
|
||||
thisType(row[col])
|
||||
break
|
||||
except (ValueError, OverflowError):
|
||||
pass
|
||||
else:
|
||||
thisType = complex
|
||||
try:
|
||||
thisType(row[col])
|
||||
except (ValueError, OverflowError):
|
||||
# fallback to length of string
|
||||
thisType = len(row[col])
|
||||
|
||||
|
|
|
@ -1020,6 +1020,42 @@ class TestSniffer(unittest.TestCase):
|
|||
'Stonecutters ''Seafood'' and Chop House'+ 'Lemont'+ 'IL'+ '12/19/02'+ 'Week Back'
|
||||
"""
|
||||
|
||||
sample10 = dedent("""
|
||||
abc,def
|
||||
ghijkl,mno
|
||||
ghi,jkl
|
||||
""")
|
||||
|
||||
sample11 = dedent("""
|
||||
abc,def
|
||||
ghijkl,mnop
|
||||
ghi,jkl
|
||||
""")
|
||||
|
||||
sample12 = dedent(""""time","forces"
|
||||
1,1.5
|
||||
0.5,5+0j
|
||||
0,0
|
||||
1+1j,6
|
||||
""")
|
||||
|
||||
sample13 = dedent(""""time","forces"
|
||||
0,0
|
||||
1,2
|
||||
a,b
|
||||
""")
|
||||
|
||||
def test_issue43625(self):
|
||||
sniffer = csv.Sniffer()
|
||||
self.assertTrue(sniffer.has_header(self.sample12))
|
||||
self.assertFalse(sniffer.has_header(self.sample13))
|
||||
|
||||
def test_has_header_strings(self):
|
||||
"More to document existing (unexpected?) behavior than anything else."
|
||||
sniffer = csv.Sniffer()
|
||||
self.assertFalse(sniffer.has_header(self.sample10))
|
||||
self.assertFalse(sniffer.has_header(self.sample11))
|
||||
|
||||
def test_has_header(self):
|
||||
sniffer = csv.Sniffer()
|
||||
self.assertIs(sniffer.has_header(self.sample1), False)
|
||||
|
|
|
@ -0,0 +1,2 @@
|
|||
Fix a bug in the detection of CSV file headers by
|
||||
:meth:`csv.Sniffer.has_header` and improve its documentation.
|
Loading…
Reference in New Issue