2022-10-05 14:43:46 +00:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
2022-10-14 14:53:06 +00:00
|
|
|
import fsutil
|
2022-10-05 14:43:46 +00:00
|
|
|
from openpyxl import load_workbook
|
|
|
|
from slugify import slugify
|
|
|
|
from xlrd import open_workbook
|
2022-10-14 14:53:06 +00:00
|
|
|
|
|
|
|
from benedict.serializers.abstract import AbstractSerializer
|
2022-10-05 14:43:46 +00:00
|
|
|
|
|
|
|
|
|
|
|
class XLSSerializer(AbstractSerializer):
    """
    This class describes a xls serializer.

    Spreadsheets are decoded to a list of dicts, one per data row, keyed by
    column name. "xlsx"/"xlsm" files are read with openpyxl (``_decode``),
    "xls"/"xlt" files with xlrd (``_decode_legacy``). Encoding is not
    implemented.

    Decoding options (all optional keyword arguments):

    - ``sheet``: sheet index (int) or sheet name (str), default ``0``.
    - ``columns``: explicit list of column names.
    - ``columns_row``: whether the first row holds the column names
      (and must therefore be skipped when reading data), default ``True``.
    - ``columns_standardized``: whether column names are slugified
      (eg. "Date Created" -> "date_created"); defaults to ``True`` only
      when ``columns`` is not given.
    """

    def __init__(self):
        super().__init__(
            extensions=[
                "xls",
                "xlsx",
                "xlsm",
            ],
        )

    def _get_sheet_index_and_name_from_options(self, **kwargs):
        """
        Split the "sheet" option into a sheet index and a sheet name.

        :param kwargs: decoding options, only "sheet" is read (default 0).
        :return: tuple ``(sheet_index, sheet_name)``; the index is the
            option value when it is an int (0 otherwise), the name is the
            option value when it is a str ("" otherwise).
        """
        sheet_index_or_name = kwargs.get("sheet", 0)
        if isinstance(sheet_index_or_name, int):
            return (sheet_index_or_name, "")
        if isinstance(sheet_index_or_name, str):
            return (0, sheet_index_or_name)
        return (0, "")

    def _get_sheet_index_by_name(self, sheet_name, sheet_names):
        """
        Return the position of ``sheet_name`` inside ``sheet_names``.

        Names are compared by their slug, so the lookup ignores the
        differences that slugify normalizes away.

        :raises Exception: if no sheet matches the given name.
        """
        sheet_slugs = [slugify(name) for name in sheet_names]
        try:
            return sheet_slugs.index(slugify(sheet_name))
        except ValueError:
            # the ValueError from list.index adds nothing to the message
            raise Exception(
                f"Invalid sheet name '{sheet_name}', sheet not found."
            ) from None

    def _get_sheet_columns_indexes(self, columns_count):
        """
        Return the list of column indexes ``[0, ..., columns_count - 1]``.
        """
        return list(range(columns_count))

    def _get_columns_options(self, options):
        """
        Read the column related options from the decoding options dict.

        :return: tuple ``(columns, columns_row, columns_standardized)``.
        """
        columns = options.get("columns", None)
        columns_row = options.get("columns_row", True)
        # explicit column names are kept verbatim unless asked otherwise
        columns_standardized = options.get("columns_standardized", columns is None)
        return (columns, columns_row, columns_standardized)

    def _standardize_columns(self, columns):
        """
        Slugify column names, eg. "Date Created" -> "date_created".

        Non-string names (eg. integer column indexes) are kept unchanged
        instead of being passed to slugify.
        """
        return [
            slugify(column, separator="_") if isinstance(column, str) else column
            for column in columns
        ]

    def _decode_legacy(self, s, **kwargs):
        """
        Decode a legacy workbook with xlrd.

        :param s: path of the file to read.
        :param kwargs: decoding options (see the class docstring).
        :return: list of dicts, one for each data row.
        """
        filepath = s

        # load the worksheet
        workbook = open_workbook(filename=filepath)

        # get sheet by index or by name
        sheet_index, sheet_name = self._get_sheet_index_and_name_from_options(**kwargs)
        if sheet_name:
            sheet_names = workbook.sheet_names()
            sheet_index = self._get_sheet_index_by_name(sheet_name, sheet_names)
        sheet = workbook.sheet_by_index(sheet_index)
        sheet_columns_range = range(sheet.ncols)

        # get columns
        columns, columns_row, columns_standardized = self._get_columns_options(kwargs)
        if not columns:
            if columns_row:
                # if first row is for column names read the names
                columns = [
                    sheet.cell_value(0, col_index) for col_index in sheet_columns_range
                ]
            else:
                # otherwise use columns indexes as column names
                # (the helper expects a count, not the range itself)
                columns = self._get_sheet_columns_indexes(sheet.ncols)

        # standardize column names, eg. "Date Created" -> "date_created"
        if columns_standardized:
            columns = self._standardize_columns(columns)

        # build list of dicts, one for each row; zip pairs names and cells
        # like _decode does, so extra names or extra cells are ignored
        items_row_start = 1 if columns_row else 0
        items = [
            {
                col_key: sheet.cell_value(row_index, col_index)
                for col_key, col_index in zip(columns, sheet_columns_range)
            }
            for row_index in range(items_row_start, sheet.nrows)
        ]
        return items

    def _decode(self, s, **kwargs):
        """
        Decode a workbook with openpyxl (opened in read-only mode).

        :param s: path of the file to read.
        :param kwargs: decoding options (see the class docstring).
        :return: list of dicts, one for each data row.
        """
        filepath = s

        # load the worksheet
        workbook = load_workbook(filename=filepath, read_only=True)
        try:
            # get sheet by index or by name
            sheet_index, sheet_name = self._get_sheet_index_and_name_from_options(
                **kwargs
            )
            sheets = list(workbook)
            if sheet_name:
                sheet_names = [sheet.title for sheet in sheets]
                sheet_index = self._get_sheet_index_by_name(sheet_name, sheet_names)
            sheet = sheets[sheet_index]

            # first row cells; empty tuple if the sheet yields no rows
            sheet_columns_cells = next(
                iter(sheet.iter_rows(min_row=1, max_row=1)), ()
            )

            # get columns
            columns, columns_row, columns_standardized = self._get_columns_options(
                kwargs
            )
            if not columns:
                if columns_row:
                    # if first row is for column names read the names
                    columns = [cell.value for cell in sheet_columns_cells]
                else:
                    # otherwise use columns indexes as column names
                    columns = self._get_sheet_columns_indexes(
                        len(sheet_columns_cells)
                    )

            # standardize column names, eg. "Date Created" -> "date_created"
            if columns_standardized:
                columns = self._standardize_columns(columns)

            # build list of dicts, one for each row (rows are 1-based here)
            items_row_start = 2 if columns_row else 1
            items = [
                dict(zip(columns, [cell.value for cell in row]))
                for row in sheet.iter_rows(min_row=items_row_start)
            ]
        finally:
            # close the worksheet even if decoding fails
            workbook.close()

        return items

    def decode(self, s, **kwargs):
        """
        Decode the spreadsheet at path ``s`` according to its extension.

        :param s: path of the file to read.
        :param kwargs: decoding options (see the class docstring).
        :return: list of dicts, one for each data row, or None when the
            file extension is not one of "xlsx", "xlsm", "xls", "xlt".
        """
        # compare case-insensitively so that eg. "DATA.XLSX" is handled too
        extension = fsutil.get_file_extension(s).lower()
        if extension in ["xlsx", "xlsm"]:
            return self._decode(s, **kwargs)
        if extension in ["xls", "xlt"]:
            return self._decode_legacy(s, **kwargs)
        return None

    def encode(self, d, **kwargs):
        """
        Encoding to a spreadsheet is not supported.

        :raises NotImplementedError: always.
        """
        raise NotImplementedError
|