This commit is contained in:
Sven Riwoldt
2024-10-19 12:31:37 +02:00
commit f7f8c52455
10176 changed files with 1619386 additions and 0 deletions

View File

@@ -0,0 +1,25 @@
from .handlers import Handler
from .reader import BIFF12Reader
from .workbook import Workbook
from .worksheet import Worksheet
# Version string of this package.
__version__ = '1.0.10'
def open_workbook(name, debug=False):
    """Open the XLSB archive *name* and return a ``Workbook`` built on it.

    ``name`` is passed unchanged to ``zipfile.ZipFile`` (read mode) and
    ``debug`` is forwarded to ``Workbook``.  The returned workbook holds the
    archive handle; ``Workbook.close()`` closes it.

    ``Workbook`` parses the archive in its constructor, so construction can
    fail (for example when an expected archive member is missing).  In that
    case the archive is closed here before the exception is re-raised, so
    the file handle does not leak.
    """
    from zipfile import ZipFile
    zf = ZipFile(name, 'r')
    try:
        return Workbook(fp=zf, debug=debug)
    except Exception:
        # Nobody else owns the handle yet, so release it ourselves.
        zf.close()
        raise
def convert_date(date):
    """Convert a spreadsheet serial date number into a ``datetime``.

    Returns ``None`` when *date* is neither an ``int`` nor a ``float``.
    The integer part is the day count and the fractional part the time of
    day, rounded to whole seconds.  A value whose integer part is zero is
    treated as a time on 1900-01-01.
    """
    from datetime import datetime, timedelta

    if not isinstance(date, (int, float)):
        return None

    whole_days = int(date)
    if whole_days == 0:
        # Pure time value: anchor it on the first representable day.
        return datetime(1900, 1, 1, 0, 0, 0) + timedelta(seconds=round(date * 24 * 60 * 60))

    if whole_days >= 61:
        # According to Lotus 1-2-3, Feb 29th 1900 is a real thing, so every
        # serial after that phantom day is one day too large.
        day_offset = whole_days - 1
    else:
        # Feb 29th 1900 (serial 60) shows up as Mar 1st 1900 because Python
        # cannot represent that date.
        day_offset = whole_days

    time_seconds = round((date % 1) * 24 * 60 * 60)
    return datetime(1899, 12, 31, 0, 0, 0) + timedelta(days=day_offset, seconds=time_seconds)

View File

@@ -0,0 +1,151 @@
# Workbook records
# Numeric record identifiers found in the workbook part.  Workbook._parse
# matches SHEET records and stops at SHEETS_END.
DEFINEDNAME = 0x0027
FILEVERSION = 0x0180
WORKBOOK = 0x0183
WORKBOOK_END = 0x0184
BOOKVIEWS = 0x0187
BOOKVIEWS_END = 0x0188
SHEETS = 0x018F
SHEETS_END = 0x0190
WORKBOOKPR = 0x0199
SHEET = 0x019C
CALCPR = 0x019D
WORKBOOKVIEW = 0x019E
EXTERNALREFERENCES = 0x02E1
EXTERNALREFERENCES_END = 0x02E2
EXTERNALREFERENCE = 0x02E3
WEBPUBLISHING = 0x04A9
# Worksheet records
# Numeric record identifiers found in worksheet parts.
ROW = 0x0000
# NOTE: Worksheet.rows() treats every ID from BLANK through FORMULA_BOOLERR
# as a cell record using a range comparison, so these values must stay
# contiguous and ordered.
BLANK = 0x0001
NUM = 0x0002
BOOLERR = 0x0003
BOOL = 0x0004
FLOAT = 0x0005
STRING = 0x0007
FORMULA_STRING = 0x0008
FORMULA_FLOAT = 0x0009
FORMULA_BOOL = 0x000A
FORMULA_BOOLERR = 0x000B
COL = 0x003C
WORKSHEET = 0x0181
WORKSHEET_END = 0x0182
SHEETVIEWS = 0x0185
SHEETVIEWS_END = 0x0186
SHEETVIEW = 0x0189
SHEETVIEW_END = 0x018A
SHEETDATA = 0x0191
SHEETDATA_END = 0x0192
SHEETPR = 0x0193
DIMENSION = 0x0194
SELECTION = 0x0198
COLS = 0x0386
COLS_END = 0x0387
CONDITIONALFORMATTING = 0x03CD
CONDITIONALFORMATTING_END = 0x03CE
CFRULE = 0x03CF
CFRULE_END = 0x03D0
ICONSET = 0x03D1
ICONSET_END = 0x03D2
DATABAR = 0x03D3
DATABAR_END = 0x03D4
COLORSCALE = 0x03D5
COLORSCALE_END = 0x03D6
CFVO = 0x03D7
PAGEMARGINS = 0x03DC
PRINTOPTIONS = 0x03DD
PAGESETUP = 0x03DE
HEADERFOOTER = 0x03DF
SHEETFORMATPR = 0x03E5
HYPERLINK = 0x03EE
DRAWING = 0x04A6
LEGACYDRAWING = 0x04A7
COLOR = 0x04B4
OLEOBJECTS = 0x04FE
OLEOBJECT = 0x04FF
OLEOBJECTS_END = 0x0580
TABLEPARTS = 0x0594
TABLEPART = 0x0595
TABLEPARTS_END = 0x0596
# SharedStrings records
# StringTable._parse collects the text of every SI record and stops at
# SST_END.
SI = 0x0013
SST = 0x019F
SST_END = 0x01A0
# Styles records
# None of these IDs has an entry in BIFF12Reader's built-in handler table,
# so by default their payload is skipped.
FONT = 0x002B
FILL = 0x002D
BORDER = 0x002E
XF = 0x002F
CELLSTYLE = 0x0030
STYLESHEET = 0x0296
STYLESHEET_END = 0x0297
COLORS = 0x03D9
COLORS_END = 0x03DA
DXFS = 0x03F9
DXFS_END = 0x03FA
TABLESTYLES = 0x03FC
TABLESTYLES_END = 0x03FD
FILLS = 0x04DB
FILLS_END = 0x04DC
FONTS = 0x04E3
FONTS_END = 0x04E4
BORDERS = 0x04E5
BORDERS_END = 0x04E6
CELLXFS = 0x04E9
CELLXFS_END = 0x04EA
CELLSTYLES = 0x04EB
CELLSTYLES_END = 0x04EC
CELLSTYLEXFS = 0x04F2
CELLSTYLEXFS_END = 0x04F3
# Comment records
# No entry for these IDs exists in BIFF12Reader's built-in handler table.
COMMENTS = 0x04F4
COMMENTS_END = 0x04F5
AUTHORS = 0x04F6
AUTHORS_END = 0x04F7
AUTHOR = 0x04F8
COMMENTLIST = 0x04F9
COMMENTLIST_END = 0x04FA
COMMENT = 0x04FB
COMMENT_END = 0x04FC
TEXT = 0x04FD
# Table records
# No entry for these IDs exists in BIFF12Reader's built-in handler table.
AUTOFILTER = 0x01A1
AUTOFILTER_END = 0x01A2
FILTERCOLUMN = 0x01A3
FILTERCOLUMN_END = 0x01A4
FILTERS = 0x01A5
FILTERS_END = 0x01A6
FILTER = 0x01A7
TABLE = 0x02D7
TABLE_END = 0x02D8
TABLECOLUMNS = 0x02D9
TABLECOLUMNS_END = 0x02DA
TABLECOLUMN = 0x02DB
TABLECOLUMN_END = 0x02DC
TABLESTYLEINFO = 0x0481
SORTSTATE = 0x0492
SORTCONDITION = 0x0494
SORTSTATE_END = 0x0495
# QueryTable records
# No entry for these IDs exists in BIFF12Reader's built-in handler table.
QUERYTABLE = 0x03BF
QUERYTABLE_END = 0x03C0
QUERYTABLEREFRESH = 0x03C1
QUERYTABLEREFRESH_END = 0x03C2
QUERYTABLEFIELDS = 0x03C7
QUERYTABLEFIELDS_END = 0x03C8
QUERYTABLEFIELD = 0x03C9
QUERYTABLEFIELD_END = 0x03CA
# Connection records
# No entry for these IDs exists in BIFF12Reader's built-in handler table.
CONNECTIONS = 0x03AD
CONNECTIONS_END = 0x03AE
CONNECTION = 0x01C9
CONNECTION_END = 0x01CA
DBPR = 0x01CB
DBPR_END = 0x01CC

View File

@@ -0,0 +1,143 @@
from . import biff12
from collections import namedtuple
class Handler(object):
    """Base record handler: discards the record payload and yields nothing."""

    def __init__(self):
        super(Handler, self).__init__()

    def read(self, reader, recid, reclen):
        """Skip ``reclen`` bytes on *reader* when the record is non-empty.

        ``recid`` is accepted for interface compatibility and not used here.
        Always returns ``None``.
        """
        has_payload = reclen > 0
        if has_payload:
            reader.skip(reclen)
        return None
class BasicHandler(Handler):
    """Handler that ignores the payload and reports a fixed label."""

    def __init__(self, name=None):
        super(BasicHandler, self).__init__()
        # Label returned for every record routed to this handler.
        self.name = name

    def read(self, reader, recid, reclen):
        """Discard the payload via the base class, then return ``self.name``."""
        super(BasicHandler, self).read(reader, recid, reclen)
        label = self.name
        return label
class StringTableHandler(Handler):
    """Decode a string-table header record into ``sst(count, uniqueCount)``."""

    cls = namedtuple('sst', ['count', 'uniqueCount'])

    def __init__(self):
        super(StringTableHandler, self).__init__()

    def read(self, reader, recid, reclen):
        """Read two consecutive integers: ``count`` then ``uniqueCount``."""
        total = reader.read_int()
        distinct = reader.read_int()
        return self.cls(total, distinct)
class StringInstanceHandler(Handler):
    """Decode one shared-string record into ``si(t)``."""

    cls = namedtuple('si', ['t'])

    def __init__(self):
        super(StringInstanceHandler, self).__init__()

    def read(self, reader, recid, reclen):
        """Skip the first payload byte, then read the string as ``t``."""
        reader.skip(1)  # leading byte is not interpreted here
        text = reader.read_string()
        return self.cls(text)
class SheetHandler(Handler):
    """Decode a sheet record into ``sheet(sheetId, rId, name)``."""

    cls = namedtuple('sheet', ['sheetId', 'rId', 'name'])

    def __init__(self):
        super(SheetHandler, self).__init__()

    def read(self, reader, recid, reclen):
        """Skip four bytes, then read the sheet id and two strings.

        The strings are the relationship id followed by the sheet name.
        """
        reader.skip(4)  # leading four bytes are not interpreted here
        sheet_id = reader.read_int()
        rel_id = reader.read_string()
        title = reader.read_string()
        return self.cls(sheet_id, rel_id, title)
class DimensionHandler(Handler):
    """Decode a dimension record into ``dimension(r, c, h, w)``."""

    cls = namedtuple('dimension', ['r', 'c', 'h', 'w'])

    def __init__(self):
        super(DimensionHandler, self).__init__()

    def read(self, reader, recid, reclen):
        """Read first/last row and first/last column, return origin and extent.

        The bounds are inclusive, hence the ``+ 1`` for height and width.
        """
        first_row = reader.read_int()
        last_row = reader.read_int()
        first_col = reader.read_int()
        last_col = reader.read_int()
        height = last_row - first_row + 1
        width = last_col - first_col + 1
        return self.cls(first_row, first_col, height, width)
class ColumnHandler(Handler):
    """Decode a column record into ``col(c1, c2, width, style)``."""

    cls = namedtuple('col', ['c1', 'c2', 'width', 'style'])

    def __init__(self):
        super(ColumnHandler, self).__init__()

    def read(self, reader, recid, reclen):
        """Read the column range, the scaled width and the style value.

        The raw width integer is divided by 256 (presumably 1/256-character
        units -- TODO confirm against the format specification).  The
        divisor is a float so the fractional part survives on Python 2 as
        well, where ``int / int`` would floor the result.
        """
        c1 = reader.read_int()
        c2 = reader.read_int()
        width = reader.read_int() / 256.0
        style = reader.read_int()
        return self.cls(c1, c2, width, style)
class RowHandler(Handler):
    """Decode a row record into ``row(r)``."""

    cls = namedtuple('row', ['r'])

    def __init__(self):
        super(RowHandler, self).__init__()

    def read(self, reader, recid, reclen):
        """Read the row number; the rest of the payload is ignored."""
        row_index = reader.read_int()
        return self.cls(row_index)
class CellHandler(Handler):
    """Decode any cell record into ``c(c, v, f, style)``.

    ``f`` is always ``None``.  The type of ``v`` depends on the record id.
    """

    cls = namedtuple('c', ['c', 'v', 'f', 'style'])

    def __init__(self):
        super(CellHandler, self).__init__()

    def read(self, reader, recid, reclen):
        """Read column and style, then the value encoded for *recid*.

        Record ids without a value branch (such as ``BLANK``) give ``None``.
        """
        column = reader.read_int()
        style_ref = reader.read_int()

        if recid == biff12.NUM:
            value = reader.read_float()
        elif recid in (biff12.FLOAT, biff12.FORMULA_FLOAT):
            value = reader.read_double()
        elif recid in (biff12.BOOL, biff12.FORMULA_BOOL):
            value = reader.read_byte() != 0
        elif recid in (biff12.BOOLERR, biff12.FORMULA_BOOLERR):
            # Error cells are reported as the hex text of the error byte.
            value = hex(reader.read_byte())
        elif recid == biff12.STRING:
            # Raw integer; Worksheet.rows() uses it as a string-table index.
            value = reader.read_int()
        elif recid == biff12.FORMULA_STRING:
            value = reader.read_string()
        else:
            value = None

        return self.cls(column, value, None, style_ref)
class HyperlinkHandler(Handler):
    """Decode a hyperlink record into ``hyperlink(r, c, h, w, rId)``."""

    cls = namedtuple('hyperlink', ['r', 'c', 'h', 'w', 'rId'])

    def __init__(self):
        super(HyperlinkHandler, self).__init__()

    def read(self, reader, recid, reclen):
        """Read the inclusive cell range and the relationship id string."""
        first_row = reader.read_int()
        last_row = reader.read_int()
        first_col = reader.read_int()
        last_col = reader.read_int()
        rel_id = reader.read_string()
        height = last_row - first_row + 1
        width = last_col - first_col + 1
        return self.cls(first_row, first_col, height, width, rel_id)

View File

@@ -0,0 +1,187 @@
import io
import os
import struct
from . import biff12
from .handlers import *
# Pre-compiled little-endian struct layouts shared by the readers below.
uint8_t = struct.Struct('<B')
uint16_t = struct.Struct('<H')
int32_t = struct.Struct('<i')
uint32_t = struct.Struct('<I')
double_t = struct.Struct('<d')


class RecordReader(object):
    """Typed reader over the payload bytes of a single record.

    Every ``read_*`` helper returns ``None`` instead of raising when fewer
    bytes remain than the value needs.  Usable as a context manager, which
    closes the underlying in-memory buffer on exit.
    """

    def __init__(self, buf, enc='utf-16-le'):
        """Wrap the bytes *buf*; *enc* is the codec used by ``read_string``.

        The default is ``'utf-16-le'`` rather than plain ``'utf-16'``.  The
        generic codec swallows a leading BOM-looking code unit and uses the
        platform's native byte order when none is present.  All other
        fields in this reader are decoded little-endian.
        """
        self._fp = io.BytesIO(buf)
        self._enc = enc

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self._fp.close()

    def tell(self):
        """Return the current offset inside the payload."""
        return self._fp.tell()

    def seek(self, offset, whence=os.SEEK_SET):
        """Move the read position; returns ``None``."""
        self._fp.seek(offset, whence)

    def skip(self, size):
        """Advance the read position by *size* bytes."""
        self._fp.seek(size, os.SEEK_CUR)

    def read(self, size):
        """Return up to *size* raw bytes."""
        return self._fp.read(size)

    def read_int(self):
        """Read an unsigned 32-bit little-endian integer."""
        buff = self._fp.read(4)
        if len(buff) < 4:
            return None
        return uint32_t.unpack(buff)[0]

    def read_short(self):
        """Read an unsigned 16-bit little-endian integer."""
        buff = self._fp.read(2)
        if len(buff) < 2:
            return None
        return uint16_t.unpack(buff)[0]

    def read_byte(self):
        """Read a single unsigned byte."""
        byte = self._fp.read(1)
        if not byte:
            return None
        return uint8_t.unpack(byte)[0]

    def read_float(self):
        """Read a 4-byte packed number and return it as a ``float``.

        Bit 1 set: the upper 30 bits are a signed integer.  Otherwise the
        upper 30 bits are the most significant bits of an IEEE double whose
        remaining bits are zero.  Bit 0 set: the result is divided by 100.
        """
        buff = self._fp.read(4)
        if len(buff) < 4:
            return None
        intval = int32_t.unpack(buff)[0]
        if intval & 0x02:
            # Arithmetic shift keeps the sign of the 30-bit integer.
            v = float(intval >> 2)
        else:
            # Rebuild a full double: four zero low bytes + masked high word.
            v = double_t.unpack(b'\x00\x00\x00\x00' + uint32_t.pack(intval & 0xFFFFFFFC))[0]
        if intval & 0x01:
            v /= 100
        return v

    def read_double(self):
        """Read an 8-byte little-endian IEEE double."""
        buff = self._fp.read(8)
        if len(buff) < 8:
            return None
        return double_t.unpack(buff)[0]

    def read_string(self):
        """Read a length-prefixed string.

        The prefix is a 32-bit count of 2-byte code units.  Returns ``None``
        when the prefix or the character data is truncated.  Undecodable
        sequences are replaced rather than raising.
        """
        length = self.read_int()
        if length is None:
            return None
        nbytes = length * 2
        buff = self.read(nbytes)
        if len(buff) < nbytes:
            return None
        return buff.decode(self._enc, errors='replace')
class BIFF12Reader(object):
    """Iterate over the records of a binary part as ``(recid, value)`` pairs.

    Each record is an id, a length and a payload.  The payload goes to the
    handler registered for the id; records whose handler returns ``None``
    are consumed silently and never yielded.
    """

    # NOTE(review): this table is a class attribute, so register_handler()
    # changes it for every reader, not only the instance it is called on.
    handlers = {
        # Workbook part handlers
        biff12.WORKBOOK: BasicHandler('workbook'),
        biff12.SHEETS: BasicHandler('sheets'),
        biff12.SHEETS_END: BasicHandler('/sheets'),
        biff12.SHEET: SheetHandler(),

        # SharedStrings part handlers
        biff12.SST: StringTableHandler(),
        biff12.SST_END: BasicHandler('/sst'),
        biff12.SI: StringInstanceHandler(),

        # Worksheet part handlers
        biff12.WORKSHEET: BasicHandler('worksheet'),
        biff12.WORKSHEET_END: BasicHandler('/worksheet'),
        biff12.DIMENSION: DimensionHandler(),
        biff12.SHEETDATA: BasicHandler('sheetData'),
        biff12.SHEETDATA_END: BasicHandler('/sheetData'),
        biff12.COLS: BasicHandler('cols'),
        biff12.COLS_END: BasicHandler('/cols'),
        biff12.COL: ColumnHandler(),
        biff12.ROW: RowHandler(),
        biff12.BLANK: CellHandler(),
        biff12.NUM: CellHandler(),
        biff12.BOOLERR: CellHandler(),
        biff12.BOOL: CellHandler(),
        biff12.FLOAT: CellHandler(),
        biff12.STRING: CellHandler(),
        biff12.FORMULA_STRING: CellHandler(),
        biff12.FORMULA_FLOAT: CellHandler(),
        biff12.FORMULA_BOOL: CellHandler(),
        biff12.FORMULA_BOOLERR: CellHandler(),
        biff12.HYPERLINK: HyperlinkHandler()
    }

    def __init__(self, fp, debug=False):
        """Read records from the binary file object *fp*.

        With *debug* set, every record is printed as it is decoded.
        """
        super(BIFF12Reader, self).__init__()
        self._debug = debug
        self._fp = fp

    def __iter__(self):
        return self

    def __next__(self):
        # Python 3 iterator protocol; Python 2 calls next() directly.
        return self.next()

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self.close()

    def tell(self):
        """Return the current offset in the underlying file object."""
        return self._fp.tell()

    def seek(self, offset, whence=os.SEEK_SET):
        """Reposition the underlying file object; returns ``None``."""
        self._fp.seek(offset, whence)

    def read_id(self):
        """Read a record id of one to four bytes.

        Bytes are combined little-endian, eight bits each with the high bit
        kept in the value.  A byte whose high bit is clear ends the id.
        Returns ``None`` at end of input.
        """
        value = 0
        for shift in range(0, 32, 8):
            raw = self._fp.read(1)
            if not raw:
                return None
            octet = uint8_t.unpack(raw)[0]
            value += octet << shift
            if not octet & 0x80:
                break
        return value

    def read_len(self):
        """Read a record length of one to four bytes.

        Each byte contributes its low seven bits, least significant group
        first.  A byte whose high bit is clear ends the length.  Returns
        ``None`` at end of input.
        """
        value = 0
        for shift in range(0, 28, 7):
            raw = self._fp.read(1)
            if not raw:
                return None
            octet = uint8_t.unpack(raw)[0]
            value += (octet & 0x7F) << shift
            if not octet & 0x80:
                break
        return value

    def register_handler(self, recid, handler):
        """Route records with id *recid* to *handler*."""
        self.handlers[recid] = handler

    def next(self):
        """Return the next ``(recid, value)`` whose handler gave a value.

        Raises ``StopIteration`` when no further record header can be read.
        """
        while True:
            start = self._fp.tell() if self._debug else None
            recid = self.read_id()
            reclen = self.read_len()
            if recid is None or reclen is None:
                raise StopIteration
            payload = self._fp.read(reclen)
            with RecordReader(payload) as record:
                # Unknown ids fall back to the base handler (skip payload).
                handler = self.handlers.get(recid) or Handler()
                result = handler.read(record, recid, reclen)
            if self._debug:
                print('{:08X} {:04X} {:<6} {} {}'.format(start, recid, reclen, ' '.join('{:02X}'.format(b) for b in payload), result))
            if result is not None:
                return (recid, result)

    def close(self):
        """Close the underlying file object."""
        self._fp.close()

View File

@@ -0,0 +1,31 @@
from . import biff12
from .reader import BIFF12Reader
class StringTable(object):
    """In-memory list of the shared strings read from a binary part.

    The whole table is parsed in the constructor.  Indexing the object or
    calling ``get_string`` returns the string at that position.
    """

    def __init__(self, fp):
        super(StringTable, self).__init__()
        self._reader = BIFF12Reader(fp=fp)
        self._strings = []
        self._parse()

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self.close()

    def __getitem__(self, key):
        return self._strings[key]

    def _parse(self):
        """Collect the text of every SI record until SST_END is seen."""
        for recid, record in self._reader:
            if recid == biff12.SST_END:
                break
            if recid == biff12.SI:
                self._strings.append(record.t)

    def get_string(self, idx):
        """Return the string stored at position *idx*."""
        return self._strings[idx]

    def close(self):
        """Close the reader and, through it, the underlying file object."""
        self._reader.close()

View File

@@ -0,0 +1,88 @@
import os
import sys
import xml.etree.ElementTree as ET
from . import biff12
from .reader import BIFF12Reader
from .stringtable import StringTable
from .worksheet import Worksheet
from tempfile import TemporaryFile
if sys.version_info > (3,):
    # Python 3 has no ``basestring``; get_sheet() uses this for its name check.
    basestring = (str, bytes)


class Workbook(object):
    """An opened XLSB workbook backed by a zip archive.

    The constructor reads the workbook relationships, the sheet list and,
    when present, the shared-string table.  Worksheets are opened on demand
    with ``get_sheet``.  Usable as a context manager.
    """

    def __init__(self, fp, debug=False):
        """Parse the workbook held in *fp*, a ZipFile-like object.

        *fp* must provide ``open(name, mode)`` and ``close()``.  *debug* is
        passed on to the record readers.
        """
        super(Workbook, self).__init__()
        self._zf = fp
        self._debug = debug
        # List of (sheet name, relationship target) in workbook order.
        self._sheets = []
        self.stringtable = None
        self._parse()

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self.close()

    @property
    def sheets(self):
        """Sheet names in workbook order."""
        return [v[0] for v in self._sheets]

    def _parse(self):
        """Load relationships, sheet list and the optional string table."""
        rels = {}
        with self._zf.open('xl/_rels/workbook.bin.rels', 'r') as zf:
            for el in ET.parse(zf).getroot():
                rels[el.attrib['Id']] = el.attrib['Target']

        # The record reader needs a seekable file, so copy the member out.
        with TemporaryFile() as temp:
            with self._zf.open('xl/workbook.bin', 'r') as zf:
                temp.write(zf.read())
            temp.seek(0, os.SEEK_SET)
            reader = BIFF12Reader(fp=temp, debug=self._debug)
            for item in reader:
                if item[0] == biff12.SHEET:
                    self._sheets.append((item[1].name, rels[item[1].rId]))
                elif item[0] == biff12.SHEETS_END:
                    break

        temp = TemporaryFile()
        try:
            with self._zf.open('xl/sharedStrings.bin', 'r') as zf:
                temp.write(zf.read())
            temp.seek(0, os.SEEK_SET)
            # On success the string table keeps the temporary file open.
            self.stringtable = StringTable(fp=temp)
        except KeyError:
            # No shared-string part: leave ``stringtable`` as None.
            temp.close()
        except Exception:
            temp.close()
            raise

    def get_sheet(self, idx, rels=False):
        """Open a worksheet by 1-based index or case-insensitive name.

        With *rels* true, the sheet's relationship part is loaded too and
        handed to the worksheet.

        Raises ``IndexError`` for an out-of-range index, ``ValueError`` for
        an unknown name, and whatever the archive raises for a missing
        member.  Temporary files created here are closed again if opening
        the sheet fails, so a failed call leaks nothing.
        """
        if isinstance(idx, basestring):
            idx = [s.lower() for s, _ in self._sheets].index(idx.lower()) + 1
        if idx < 1 or idx > len(self._sheets):
            raise IndexError('sheet index out of range')

        name = self._sheets[idx - 1][0]
        target = self._sheets[idx - 1][1].split('/')

        temp = TemporaryFile()
        rels_temp = None
        try:
            with self._zf.open('xl/{}/{}'.format(target[0], target[-1]), 'r') as zf:
                temp.write(zf.read())
            temp.seek(0, os.SEEK_SET)

            if rels:
                rels_temp = TemporaryFile()
                with self._zf.open('xl/{}/_rels/{}.rels'.format(target[0], target[-1]), 'r') as zf:
                    rels_temp.write(zf.read())
                rels_temp.seek(0, os.SEEK_SET)

            # On success the worksheet takes over both files.
            return Worksheet(name=name, fp=temp, rels_fp=rels_temp, stringtable=self.stringtable, debug=self._debug)
        except Exception:
            temp.close()
            if rels_temp is not None:
                rels_temp.close()
            raise

    def close(self):
        """Close the archive and, if loaded, the shared-string table."""
        self._zf.close()
        if self.stringtable is not None:
            self.stringtable.close()

View File

@@ -0,0 +1,83 @@
import os
import sys
import xml.etree.ElementTree as ET
from . import biff12
from .reader import BIFF12Reader
from collections import namedtuple
if sys.version_info > (3,):
    # Python 3 has no ``xrange``; alias it so the loops below run on both.
    xrange = range

# One cell as yielded by Worksheet.rows(): row index, column index, value.
Cell = namedtuple('Cell', ['r', 'c', 'v'])


class Worksheet(object):
    """A single worksheet read from a binary part.

    The constructor scans the part for the dimension, column records and
    the start of the sheet data.  If a relationships file is given it also
    collects hyperlinks.  Iterating the worksheet is ``rows()``.
    """

    def __init__(self, name, fp, rels_fp=None, stringtable=None, debug=False):
        super(Worksheet, self).__init__()
        self.name = name
        self._reader = BIFF12Reader(fp=fp, debug=debug)
        self._rels_fp = rels_fp
        self._rels = ET.parse(rels_fp).getroot() if rels_fp is not None else None
        self._stringtable = stringtable
        # Reader offset just after the SHEETDATA record; rows() seeks here.
        self._data_offset = 0
        self.dimension = None
        self.cols = []
        self.rels = {}
        # Maps (row, column) to the relationship id of the hyperlink.
        self.hyperlinks = {}
        self._parse()

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self.close()

    def __iter__(self):
        return self.rows()

    def _parse(self):
        """Read sheet metadata, stopping early when no rels were supplied."""
        with_rels = self._rels is not None
        if with_rels:
            for el in self._rels:
                self.rels[el.attrib['Id']] = el.attrib['Target']

        for recid, record in self._reader:
            if recid == biff12.DIMENSION:
                self.dimension = record
            elif recid == biff12.COL:
                self.cols.append(record)
            elif recid == biff12.SHEETDATA:
                self._data_offset = self._reader.tell()
                # Without rels nothing after the data start is needed.
                if not with_rels:
                    break
            elif recid == biff12.HYPERLINK and with_rels:
                # Register the link for every cell of the covered range.
                for row_delta in xrange(record.h):
                    for col_delta in xrange(record.w):
                        self.hyperlinks[record.r + row_delta, record.c + col_delta] = record.rId

    def _blank_row(self, row_num):
        """Return a row of empty cells spanning column 0 to the last one."""
        width = self.dimension.c + self.dimension.w
        return [Cell(row_num, col, None) for col in xrange(width)]

    def rows(self, sparse=False):
        """Yield each row as a list of ``Cell`` tuples.

        Unless *sparse* is true, row numbers missing from the data are
        yielded as rows of empty cells.  String cells are resolved through
        the string table when one was supplied.
        """
        self._reader.seek(self._data_offset, os.SEEK_SET)
        row_num = -1
        row = None
        for recid, record in self._reader:
            if recid == biff12.ROW and record.r != row_num:
                if row is not None:
                    yield row
                if not sparse:
                    # Fill the gap before this row with empty rows.
                    while row_num < record.r - 1:
                        row_num += 1
                        yield self._blank_row(row_num)
                row_num = record.r
                row = self._blank_row(row_num)
            elif biff12.BLANK <= recid <= biff12.FORMULA_BOOLERR:
                value = record.v
                if recid == biff12.STRING and self._stringtable is not None:
                    value = self._stringtable[value]
                row[record.c] = Cell(row_num, record.c, value)
            elif recid == biff12.SHEETDATA_END:
                if row is not None:
                    yield row
                break

    def close(self):
        """Close the record reader and the relationships file, if any."""
        self._reader.close()
        if self._rels_fp is not None:
            self._rels_fp.close()