Source code for openpyxl.reader.worksheet

from __future__ import absolute_import
# Copyright (c) 2010-2019 openpyxl

"""Reader for a single worksheet."""
from io import BytesIO
from warnings import warn

# compatibility imports
from openpyxl.xml.functions import iterparse

# package imports
from openpyxl.cell import Cell
from openpyxl.worksheet.filters import AutoFilter, SortState
from openpyxl.cell.read_only import _cast_number
from openpyxl.cell.text import Text
from openpyxl.worksheet import Worksheet
from openpyxl.worksheet.dimensions import (
    ColumnDimension,
    RowDimension,
    SheetFormatProperties,
)
from openpyxl.worksheet.header_footer import HeaderFooter
from openpyxl.worksheet.hyperlink import Hyperlink
from openpyxl.worksheet.merge import MergeCells
from openpyxl.worksheet.cell_range import CellRange
from openpyxl.worksheet.page import PageMargins, PrintOptions, PrintPageSetup
from openpyxl.worksheet.pagebreak import PageBreak
from openpyxl.worksheet.protection import SheetProtection
from openpyxl.worksheet.views import SheetViewList
from openpyxl.worksheet.datavalidation import DataValidationList
from openpyxl.xml.constants import (
    SHEET_MAIN_NS,
    REL_NS,
    EXT_TYPES,
    PKG_REL_NS
)
from openpyxl.xml.functions import safe_iterator, localname
from openpyxl.styles import Color
from openpyxl.styles import is_date_format
from openpyxl.formatting import Rule
from openpyxl.formatting.formatting import ConditionalFormatting
from openpyxl.formula.translate import Translator
from openpyxl.worksheet.properties import WorksheetProperties
from openpyxl.utils import (
    coordinate_from_string,
    get_column_letter,
    column_index_from_string,
    coordinate_to_tuple,
    )
from openpyxl.utils.datetime import from_excel, from_ISO8601
from openpyxl.descriptors.excel import ExtensionList, Extension
from openpyxl.worksheet.table import TablePartList


def _get_xml_iter(xml_source):
    """
    Possible inputs: strings, bytes, members of zipfile, temporary file
    Always return a file like object
    """
    if not hasattr(xml_source, 'read'):
        try:
            xml_source = xml_source.encode("utf-8")
        except (AttributeError, UnicodeDecodeError):
            pass
        return BytesIO(xml_source)
    else:
        try:
            xml_source.seek(0)
        except:
            # could be a zipfile
            pass
        return xml_source


[docs]class WorkSheetParser(object): CELL_TAG = '{%s}c' % SHEET_MAIN_NS VALUE_TAG = '{%s}v' % SHEET_MAIN_NS FORMULA_TAG = '{%s}f' % SHEET_MAIN_NS MERGE_TAG = '{%s}mergeCell' % SHEET_MAIN_NS INLINE_STRING = "{%s}is" % SHEET_MAIN_NS def __init__(self, ws, xml_source, shared_strings): self.ws = ws self.epoch = ws.parent.epoch self.source = xml_source self.shared_strings = shared_strings self.guess_types = ws.parent.guess_types self.data_only = ws.parent.data_only self.styles = ws.parent._cell_styles self.differential_styles = ws.parent._differential_styles self.keep_vba = ws.parent.vba_archive is not None self.shared_formula_masters = {} # {si_str: Translator()} self._row_count = self._col_count = 0 self.tables = []
[docs] def parse(self): dispatcher = { '{%s}mergeCells' % SHEET_MAIN_NS: self.parse_merge, '{%s}col' % SHEET_MAIN_NS: self.parse_column_dimensions, '{%s}row' % SHEET_MAIN_NS: self.parse_row, '{%s}conditionalFormatting' % SHEET_MAIN_NS: self.parser_conditional_formatting, '{%s}legacyDrawing' % SHEET_MAIN_NS: self.parse_legacy_drawing, '{%s}sheetProtection' % SHEET_MAIN_NS: self.parse_sheet_protection, '{%s}extLst' % SHEET_MAIN_NS: self.parse_extensions, '{%s}hyperlink' % SHEET_MAIN_NS: self.parse_hyperlinks, '{%s}tableParts' % SHEET_MAIN_NS: self.parse_tables, } properties = { '{%s}printOptions' % SHEET_MAIN_NS: ('print_options', PrintOptions), '{%s}pageMargins' % SHEET_MAIN_NS: ('page_margins', PageMargins), '{%s}pageSetup' % SHEET_MAIN_NS: ('page_setup', PrintPageSetup), '{%s}headerFooter' % SHEET_MAIN_NS: ('HeaderFooter', HeaderFooter), '{%s}autoFilter' % SHEET_MAIN_NS: ('auto_filter', AutoFilter), '{%s}dataValidations' % SHEET_MAIN_NS: ('data_validations', DataValidationList), #'{%s}sheet/{%s}sortState' % (SHEET_MAIN_NS, SHEET_MAIN_NS): ('sort_state', SortState), '{%s}sheetPr' % SHEET_MAIN_NS: ('sheet_properties', WorksheetProperties), '{%s}sheetViews' % SHEET_MAIN_NS: ('views', SheetViewList), '{%s}sheetFormatPr' % SHEET_MAIN_NS: ('sheet_format', SheetFormatProperties), '{%s}rowBreaks' % SHEET_MAIN_NS: ('page_breaks', PageBreak), } stream = _get_xml_iter(self.source) it = iterparse(stream, tag=dispatcher) for _, element in it: tag_name = element.tag if tag_name in dispatcher: dispatcher[tag_name](element) element.clear() elif tag_name in properties: prop = properties[tag_name] obj = prop[1].from_tree(element) setattr(self.ws, prop[0], obj) element.clear() self.ws._current_row = self.ws.max_row
[docs] def parse_cell(self, element): value = element.find(self.VALUE_TAG) if value is not None: value = value.text formula = element.find(self.FORMULA_TAG) data_type = element.get('t', 'n') coordinate = element.get('r') self._col_count += 1 style_id = element.get('s') # assign formula to cell value unless only the data is desired if formula is not None and not self.data_only: data_type = 'f' if formula.text: value = "=" + formula.text else: value = "=" formula_type = formula.get('t') if formula_type: if formula_type != "shared": self.ws.formula_attributes[coordinate] = dict(formula.attrib) else: si = formula.get('si') # Shared group index for shared formulas # The spec (18.3.1.40) defines shared formulae in # terms of the following: # # `master`: "The first formula in a group of shared # formulas" # `ref`: "Range of cells which the formula applies # to." It's a required attribute on the master # cell, forbidden otherwise. # `shared cell`: "A cell is shared only when si is # used and t is `shared`." # # Whether to use the cell's given formula or the # master's depends on whether the cell is shared, # whether it's in the ref, and whether it defines its # own formula, as follows: # # Shared? Has formula? | In ref Not in ref # ========= ==============|======== =============== # Yes Yes | master impl. defined # No Yes | own own # Yes No | master impl. defined # No No | ?? N/A # # The ?? is because the spec is silent on this issue, # though my inference is that the cell does not # receive a formula at all. # # For this implementation, we are using the master # formula in the two "impl. defined" cases and no # formula in the "??" case. This choice of # implementation allows us to disregard the `ref` # parameter altogether, and does not require # computing expressions like `C5 in A1:D6`. # Presumably, Excel does not generate spreadsheets # with such contradictions. if si in self.shared_formula_masters: trans = self.shared_formula_masters[si] value = trans.translate_formula(coordinate) else: self.shared_formula_masters[si] = Translator(value, coordinate) style_array = None if style_id is not None: style_id = int(style_id) style_array = self.styles[style_id] if coordinate: row, column = coordinate_to_tuple(coordinate) else: row, column = self._row_count, self._col_count cell = Cell(self.ws, row=row, col_idx=column, style_array=style_array) self.ws._cells[(row, column)] = cell if value is not None: if data_type == 'n': value = _cast_number(value) if is_date_format(cell.number_format): data_type = 'd' value = from_excel(value, self.epoch) elif data_type == 'b': value = bool(int(value)) elif data_type == 's': value = self.shared_strings[int(value)] elif data_type == 'str': data_type = 's' elif data_type == 'd': value = from_ISO8601(value) else: if data_type == 'inlineStr': child = element.find(self.INLINE_STRING) if child is not None: data_type = 's' richtext = Text.from_tree(child) value = richtext.content if self.guess_types or value is None: cell.value = value else: cell._value = value cell.data_type = data_type
[docs] def parse_merge(self, element): merged = MergeCells.from_tree(element) self.ws.merged_cells.ranges = merged.mergeCell for cr in merged.mergeCell: self.ws._clean_merge_range(cr)
[docs] def parse_column_dimensions(self, col): attrs = dict(col.attrib) column = get_column_letter(int(attrs['min'])) attrs['index'] = column if 'style' in attrs: attrs['style'] = self.styles[int(attrs['style'])] dim = ColumnDimension(self.ws, **attrs) self.ws.column_dimensions[column] = dim
[docs] def parse_row(self, row): attrs = dict(row.attrib) if "r" in attrs: self._row_count = int(attrs['r']) else: self._row_count += 1 self._col_count = 0 keys = set(attrs) for key in keys: if key == "s": attrs['s'] = self.styles[int(attrs['s'])] elif key.startswith('{'): del attrs[key] keys = set(attrs) if keys != set(['r', 'spans']) and keys != set(['r']): # don't create dimension objects unless they have relevant information dim = RowDimension(self.ws, **attrs) self.ws.row_dimensions[dim.index] = dim for cell in safe_iterator(row, self.CELL_TAG): self.parse_cell(cell)
[docs] def parser_conditional_formatting(self, element): cf = ConditionalFormatting.from_tree(element) for rule in cf.rules: if rule.dxfId is not None: rule.dxf = self.differential_styles[rule.dxfId] self.ws.conditional_formatting[cf] = rule
[docs] def parse_sheet_protection(self, element): self.ws.protection = SheetProtection.from_tree(element) password = element.get("password") if password is not None: self.ws.protection.set_password(password, True)
[docs] def parse_legacy_drawing(self, element): if self.keep_vba: # For now just save the legacy drawing id. # We will later look up the file name self.ws.legacy_drawing = element.get('{%s}id' % REL_NS)
[docs] def parse_extensions(self, element): extLst = ExtensionList.from_tree(element) for e in extLst.ext: ext_type = EXT_TYPES.get(e.uri.upper(), "Unknown") msg = "{0} extension is not supported and will be removed".format(ext_type) warn(msg)
[docs] def parse_tables(self, element): for t in TablePartList.from_tree(element).tablePart: rel = self.ws._rels[t.id] self.tables.append(rel.Target)