from __future__ import absolute_import
# Copyright (c) 2010-2019 openpyxl
"""Reader for a single worksheet."""
from io import BytesIO
from warnings import warn
# compatibility imports
from openpyxl.xml.functions import iterparse
# package imports
from openpyxl.cell import Cell
from openpyxl.worksheet.filters import AutoFilter, SortState
from openpyxl.cell.read_only import _cast_number
from openpyxl.cell.text import Text
from openpyxl.worksheet import Worksheet
from openpyxl.worksheet.dimensions import (
ColumnDimension,
RowDimension,
SheetFormatProperties,
)
from openpyxl.worksheet.header_footer import HeaderFooter
from openpyxl.worksheet.hyperlink import Hyperlink
from openpyxl.worksheet.merge import MergeCells
from openpyxl.worksheet.cell_range import CellRange
from openpyxl.worksheet.page import PageMargins, PrintOptions, PrintPageSetup
from openpyxl.worksheet.pagebreak import PageBreak
from openpyxl.worksheet.protection import SheetProtection
from openpyxl.worksheet.views import SheetViewList
from openpyxl.worksheet.datavalidation import DataValidationList
from openpyxl.xml.constants import (
SHEET_MAIN_NS,
REL_NS,
EXT_TYPES,
PKG_REL_NS
)
from openpyxl.xml.functions import safe_iterator, localname
from openpyxl.styles import Color
from openpyxl.styles import is_date_format
from openpyxl.formatting import Rule
from openpyxl.formatting.formatting import ConditionalFormatting
from openpyxl.formula.translate import Translator
from openpyxl.worksheet.properties import WorksheetProperties
from openpyxl.utils import (
coordinate_from_string,
get_column_letter,
column_index_from_string,
coordinate_to_tuple,
)
from openpyxl.utils.datetime import from_excel, from_ISO8601
from openpyxl.descriptors.excel import ExtensionList, Extension
from openpyxl.worksheet.table import TablePartList
def _get_xml_iter(xml_source):
"""
Possible inputs: strings, bytes, members of zipfile, temporary file
Always return a file like object
"""
if not hasattr(xml_source, 'read'):
try:
xml_source = xml_source.encode("utf-8")
except (AttributeError, UnicodeDecodeError):
pass
return BytesIO(xml_source)
else:
try:
xml_source.seek(0)
except:
# could be a zipfile
pass
return xml_source
[docs]class WorkSheetParser(object):
CELL_TAG = '{%s}c' % SHEET_MAIN_NS
VALUE_TAG = '{%s}v' % SHEET_MAIN_NS
FORMULA_TAG = '{%s}f' % SHEET_MAIN_NS
MERGE_TAG = '{%s}mergeCell' % SHEET_MAIN_NS
INLINE_STRING = "{%s}is" % SHEET_MAIN_NS
def __init__(self, ws, xml_source, shared_strings):
self.ws = ws
self.epoch = ws.parent.epoch
self.source = xml_source
self.shared_strings = shared_strings
self.guess_types = ws.parent.guess_types
self.data_only = ws.parent.data_only
self.styles = ws.parent._cell_styles
self.differential_styles = ws.parent._differential_styles
self.keep_vba = ws.parent.vba_archive is not None
self.shared_formula_masters = {} # {si_str: Translator()}
self._row_count = self._col_count = 0
self.tables = []
[docs] def parse(self):
dispatcher = {
'{%s}mergeCells' % SHEET_MAIN_NS: self.parse_merge,
'{%s}col' % SHEET_MAIN_NS: self.parse_column_dimensions,
'{%s}row' % SHEET_MAIN_NS: self.parse_row,
'{%s}conditionalFormatting' % SHEET_MAIN_NS: self.parser_conditional_formatting,
'{%s}legacyDrawing' % SHEET_MAIN_NS: self.parse_legacy_drawing,
'{%s}sheetProtection' % SHEET_MAIN_NS: self.parse_sheet_protection,
'{%s}extLst' % SHEET_MAIN_NS: self.parse_extensions,
'{%s}hyperlink' % SHEET_MAIN_NS: self.parse_hyperlinks,
'{%s}tableParts' % SHEET_MAIN_NS: self.parse_tables,
}
properties = {
'{%s}printOptions' % SHEET_MAIN_NS: ('print_options', PrintOptions),
'{%s}pageMargins' % SHEET_MAIN_NS: ('page_margins', PageMargins),
'{%s}pageSetup' % SHEET_MAIN_NS: ('page_setup', PrintPageSetup),
'{%s}headerFooter' % SHEET_MAIN_NS: ('HeaderFooter', HeaderFooter),
'{%s}autoFilter' % SHEET_MAIN_NS: ('auto_filter', AutoFilter),
'{%s}dataValidations' % SHEET_MAIN_NS: ('data_validations', DataValidationList),
#'{%s}sheet/{%s}sortState' % (SHEET_MAIN_NS, SHEET_MAIN_NS): ('sort_state', SortState),
'{%s}sheetPr' % SHEET_MAIN_NS: ('sheet_properties', WorksheetProperties),
'{%s}sheetViews' % SHEET_MAIN_NS: ('views', SheetViewList),
'{%s}sheetFormatPr' % SHEET_MAIN_NS: ('sheet_format', SheetFormatProperties),
'{%s}rowBreaks' % SHEET_MAIN_NS: ('page_breaks', PageBreak),
}
stream = _get_xml_iter(self.source)
it = iterparse(stream, tag=dispatcher)
for _, element in it:
tag_name = element.tag
if tag_name in dispatcher:
dispatcher[tag_name](element)
element.clear()
elif tag_name in properties:
prop = properties[tag_name]
obj = prop[1].from_tree(element)
setattr(self.ws, prop[0], obj)
element.clear()
self.ws._current_row = self.ws.max_row
[docs] def parse_cell(self, element):
value = element.find(self.VALUE_TAG)
if value is not None:
value = value.text
formula = element.find(self.FORMULA_TAG)
data_type = element.get('t', 'n')
coordinate = element.get('r')
self._col_count += 1
style_id = element.get('s')
# assign formula to cell value unless only the data is desired
if formula is not None and not self.data_only:
data_type = 'f'
if formula.text:
value = "=" + formula.text
else:
value = "="
formula_type = formula.get('t')
if formula_type:
if formula_type != "shared":
self.ws.formula_attributes[coordinate] = dict(formula.attrib)
else:
si = formula.get('si') # Shared group index for shared formulas
# The spec (18.3.1.40) defines shared formulae in
# terms of the following:
#
# `master`: "The first formula in a group of shared
# formulas"
# `ref`: "Range of cells which the formula applies
# to." It's a required attribute on the master
# cell, forbidden otherwise.
# `shared cell`: "A cell is shared only when si is
# used and t is `shared`."
#
# Whether to use the cell's given formula or the
# master's depends on whether the cell is shared,
# whether it's in the ref, and whether it defines its
# own formula, as follows:
#
# Shared? Has formula? | In ref Not in ref
# ========= ==============|======== ===============
# Yes Yes | master impl. defined
# No Yes | own own
# Yes No | master impl. defined
# No No | ?? N/A
#
# The ?? is because the spec is silent on this issue,
# though my inference is that the cell does not
# receive a formula at all.
#
# For this implementation, we are using the master
# formula in the two "impl. defined" cases and no
# formula in the "??" case. This choice of
# implementation allows us to disregard the `ref`
# parameter altogether, and does not require
# computing expressions like `C5 in A1:D6`.
# Presumably, Excel does not generate spreadsheets
# with such contradictions.
if si in self.shared_formula_masters:
trans = self.shared_formula_masters[si]
value = trans.translate_formula(coordinate)
else:
self.shared_formula_masters[si] = Translator(value, coordinate)
style_array = None
if style_id is not None:
style_id = int(style_id)
style_array = self.styles[style_id]
if coordinate:
row, column = coordinate_to_tuple(coordinate)
else:
row, column = self._row_count, self._col_count
cell = Cell(self.ws, row=row, col_idx=column, style_array=style_array)
self.ws._cells[(row, column)] = cell
if value is not None:
if data_type == 'n':
value = _cast_number(value)
if is_date_format(cell.number_format):
data_type = 'd'
value = from_excel(value, self.epoch)
elif data_type == 'b':
value = bool(int(value))
elif data_type == 's':
value = self.shared_strings[int(value)]
elif data_type == 'str':
data_type = 's'
elif data_type == 'd':
value = from_ISO8601(value)
else:
if data_type == 'inlineStr':
child = element.find(self.INLINE_STRING)
if child is not None:
data_type = 's'
richtext = Text.from_tree(child)
value = richtext.content
if self.guess_types or value is None:
cell.value = value
else:
cell._value = value
cell.data_type = data_type
[docs] def parse_merge(self, element):
merged = MergeCells.from_tree(element)
self.ws.merged_cells.ranges = merged.mergeCell
for cr in merged.mergeCell:
self.ws._clean_merge_range(cr)
[docs] def parse_column_dimensions(self, col):
attrs = dict(col.attrib)
column = get_column_letter(int(attrs['min']))
attrs['index'] = column
if 'style' in attrs:
attrs['style'] = self.styles[int(attrs['style'])]
dim = ColumnDimension(self.ws, **attrs)
self.ws.column_dimensions[column] = dim
[docs] def parse_row(self, row):
attrs = dict(row.attrib)
if "r" in attrs:
self._row_count = int(attrs['r'])
else:
self._row_count += 1
self._col_count = 0
keys = set(attrs)
for key in keys:
if key == "s":
attrs['s'] = self.styles[int(attrs['s'])]
elif key.startswith('{'):
del attrs[key]
keys = set(attrs)
if keys != set(['r', 'spans']) and keys != set(['r']):
# don't create dimension objects unless they have relevant information
dim = RowDimension(self.ws, **attrs)
self.ws.row_dimensions[dim.index] = dim
for cell in safe_iterator(row, self.CELL_TAG):
self.parse_cell(cell)
[docs] def parse_sheet_protection(self, element):
self.ws.protection = SheetProtection.from_tree(element)
password = element.get("password")
if password is not None:
self.ws.protection.set_password(password, True)
[docs] def parse_legacy_drawing(self, element):
if self.keep_vba:
# For now just save the legacy drawing id.
# We will later look up the file name
self.ws.legacy_drawing = element.get('{%s}id' % REL_NS)
[docs] def parse_extensions(self, element):
extLst = ExtensionList.from_tree(element)
for e in extLst.ext:
ext_type = EXT_TYPES.get(e.uri.upper(), "Unknown")
msg = "{0} extension is not supported and will be removed".format(ext_type)
warn(msg)
[docs] def parse_hyperlinks(self, element):
link = Hyperlink.from_tree(element)
if link.id:
rel = self.ws._rels[link.id]
link.target = rel.Target
if ":" in link.ref:
# range of cells
for row in self.ws[link.ref]:
for cell in row:
cell.hyperlink = link
else:
self.ws[link.ref].hyperlink = link
[docs] def parse_tables(self, element):
for t in TablePartList.from_tree(element).tablePart:
rel = self.ws._rels[t.id]
self.tables.append(rel.Target)