From bd9b64be581a772513391b1758cfa1e78ab01473 Mon Sep 17 00:00:00 2001 From: Heinrich Lukas Weil Date: Fri, 2 Feb 2024 01:01:35 +0100 Subject: [PATCH] first draft of experimental ziparchive xlsx reader --- .../FsSpreadsheet.ExcelIO.fsproj | 1 + src/FsSpreadsheet.ExcelIO/ZipArchiveReader.fs | 340 ++++++++++++++++++ .../FsSpreadsheet.ExcelIO.Tests.fsproj | 1 + .../ZipArchiveReader.fs | 42 +++ 4 files changed, 384 insertions(+) create mode 100644 src/FsSpreadsheet.ExcelIO/ZipArchiveReader.fs create mode 100644 tests/FsSpreadsheet.ExcelIO.Tests/ZipArchiveReader.fs diff --git a/src/FsSpreadsheet.ExcelIO/FsSpreadsheet.ExcelIO.fsproj b/src/FsSpreadsheet.ExcelIO/FsSpreadsheet.ExcelIO.fsproj index d4f73109..c73a10e5 100644 --- a/src/FsSpreadsheet.ExcelIO/FsSpreadsheet.ExcelIO.fsproj +++ b/src/FsSpreadsheet.ExcelIO/FsSpreadsheet.ExcelIO.fsproj @@ -29,6 +29,7 @@ + diff --git a/src/FsSpreadsheet.ExcelIO/ZipArchiveReader.fs b/src/FsSpreadsheet.ExcelIO/ZipArchiveReader.fs new file mode 100644 index 00000000..bff7ba83 --- /dev/null +++ b/src/FsSpreadsheet.ExcelIO/ZipArchiveReader.fs @@ -0,0 +1,340 @@ +module FsSpreadsheet.ExcelIO.ZipArchiveReader + + +open System.IO.Compression +open System.Xml + +open FsSpreadsheet + +let inline bool (v : string) = + match v with + | null | "0" -> false + | "1" -> true + | _ -> + let v' = v.ToLower() + if v' = "true" then true + elif v' = "false" then false + else failwith "Invalid boolean value" + + +module DType = + [] + let boolean = "b" + [] + let number = "n" + [] + let error = "e" + [] + let sharedString = "s" + [] + let string = "4" + [] + let inlineString = "5" + [] + let date = "d" + +type Relationship = + { Id : string + Type : string + Target : string } + +type WorkBook = ZipArchive + +type SharedStrings = string [] + +type Relationships = Map + +type NumberFormat = + { Id : int + FormatCode : string } + + static member isDateTime (numberFormat : NumberFormat) = + let format = numberFormat.FormatCode + let input = System.DateTime.Now.ToString(format, System.Globalization.CultureInfo.InvariantCulture) + let dt = System.DateTime.ParseExact( + input, + format, + System.Globalization.CultureInfo.InvariantCulture, + System.Globalization.DateTimeStyles.NoCurrentDateDefault + ) + dt <> Unchecked.defaultof + +and CellFormat = + { + NumberFormatId : int + ApllyNumberFormat : bool + } + + static member dateTimeFormats = Set.ofList [14..22] + static member customFormats = Set.ofList [164 .. 180] + + static member isDateTime (styles : Styles) (cf : CellFormat) = + // if numberformatid is between 14 and 18 it is standard date time format. + // custom formats are given in the range of 164 to 180, all none default date time formats fall in there. + if Set.contains cf.NumberFormatId CellFormat.dateTimeFormats then + true + elif Set.contains cf.NumberFormatId CellFormat.customFormats then + styles.NumberFormats.TryFind cf.NumberFormatId + |> Option.map NumberFormat.isDateTime + |> Option.defaultValue true + else + false + +and Styles = + { + NumberFormats : Map + CellFormats : CellFormat [] + } + + +let parseRelationsships (relationships : ZipArchiveEntry) : Relationships = + try + let relationshipsStream = relationships.Open() + let relationshipsReader = System.Xml.XmlReader.Create(relationshipsStream) + let mutable rels = [||] + while relationshipsReader.Read() do + if relationshipsReader.NodeType = XmlNodeType.Element && relationshipsReader.Name = "Relationship" then + let id = relationshipsReader.GetAttribute("Id") + let typ = relationshipsReader.GetAttribute("Type") + let target = + let t = relationshipsReader.GetAttribute("Target") + if t.StartsWith "xl/" then t + elif t.StartsWith "../" then t.Replace("../","xl/") + else "xl/" + t + rels <- Array.append rels [|{Id = id; Type = typ; Target = target}|] + rels + |> Array.map (fun r -> r.Id, r) |> Map.ofArray + with + | _ -> Map.empty + +let getWbRelationships (wb : WorkBook) = + wb.GetEntry("xl/_rels/workbook.xml.rels") + |> parseRelationsships + +let getWsRelationships (ws : string) (wb : WorkBook) = + wb.GetEntry(ws.Replace("worksheets/","worksheets/_rels/").Replace(".xml",".xml.rels")) + |> parseRelationsships + +let getSharedStrings (wb : WorkBook) : SharedStrings = + try + + let sharedStrings = wb.GetEntry("xl/sharedStrings.xml") + let sharedStringsStream = sharedStrings.Open() + let sharedStringsReader = System.Xml.XmlReader.Create(sharedStringsStream) + [| + while sharedStringsReader.Read() do + if sharedStringsReader.NodeType = System.Xml.XmlNodeType.Element && sharedStringsReader.Name = "si" then + sharedStringsReader.ReadToDescendant("t") |> ignore + yield sharedStringsReader.ReadElementContentAsString() + |] + with + | _ -> [||] + +let getStyles (wb : WorkBook) = + let styles = wb.GetEntry("xl/styles.xml") + let stylesStream = styles.Open() + let stylesReader = System.Xml.XmlReader.Create(stylesStream) + let mutable numberFormats = Map.empty + let mutable cellFormats = Array.empty + while stylesReader.Read() do + if stylesReader.NodeType = XmlNodeType.Element && stylesReader.Name = "numFmts" then + let subReader = stylesReader.ReadSubtree() + let numFmts = + [| + while subReader.Read() do + if subReader.NodeType = XmlNodeType.Element && subReader.Name = "numFmt" then + let id = subReader.GetAttribute("numFmtId") |> int + let formatCode = subReader.GetAttribute("formatCode") + yield id, {Id = id; FormatCode = formatCode} + |] + numberFormats <- Map.ofArray numFmts + if stylesReader.NodeType = XmlNodeType.Element && stylesReader.Name = "cellXfs" then + let subReader = stylesReader.ReadSubtree() + let cellFmts = + [| + while subReader.Read() do + if subReader.NodeType = XmlNodeType.Element && subReader.Name = "xf" then + let numFmtId = subReader.GetAttribute("numFmtId") |> int + let applyNumberFormat = subReader.GetAttribute("applyNumberFormat") |> bool + yield {NumberFormatId = numFmtId; ApllyNumberFormat = applyNumberFormat} + |] + cellFormats <- cellFmts + {NumberFormats = numberFormats; CellFormats = cellFormats} + + +let parseTable (sheet : ZipArchiveEntry) = + try + let stream = sheet.Open() + let reader = System.Xml.XmlReader.Create(stream) + while reader.Name <> "table" do + reader.Read() |> ignore + let area = reader.GetAttribute("ref") + let ra = FsRangeAddress(area) + let totalsRowShown = + let attribute = reader.GetAttribute("totalsRowShown") + match attribute with + | null + | "0" -> false + | "1" -> true + | _ -> false + let name = + let dn = reader.GetAttribute("displayName") + if dn = null then + reader.GetAttribute("name") + else dn + FsTable(name, ra, totalsRowShown, true) + with + | err -> failwithf "Error while parsing table \"%s\":%s" sheet.FullName err.Message + +//zip.Entries +//|> Seq.map (fun e -> e.Name) +//|> Seq.toArray + +// Apply the parseTable function to every zip entry starting with "xl/tables" +let getTables (wb : WorkBook) = + wb.Entries + |> Seq.choose (fun e -> + if e.FullName.StartsWith("xl/tables") && e.FullName <> "xl/tables" && e.FullName <> "xl/tables/" then + parseTable e + |> Some + else None) + |> Seq.toArray + +let parseDataType (styles : Styles) (dataType : string) (style : string) (formula : string) = + if formula <> null then + // LibreOffice annotates boolean values as formulas instead of boolean datatypes + if formula = "TRUE()" || formula = "FALSE()" then + DataType.Boolean + else + DataType.Number + //https://stackoverflow.com/a/13178043/12858021 + //https://stackoverflow.com/a/55425719/12858021 + // if styleindex is not null and datatype is null we propably have a DateTime field. + // if datatype would not be null it could also be boolean, as far as i tested it ~Kevin F 13.10.2023 + elif style <> null && (dataType = null || dataType = DType.number) then + try + let cellFormat : CellFormat = styles.CellFormats.[int style] + if (*cellFormat <> null &&*) CellFormat.isDateTime styles cellFormat then + DataType.Date + + else + DataType.Number + with + | _ -> DataType.Number + else + match dataType with + | DType.number -> DataType.Number + | DType.boolean -> DataType.Boolean + | DType.date -> DataType.Date + | DType.error -> DataType.Empty + | DType.inlineString + | DType.sharedString + | DType.string -> DataType.String + | _ -> DataType.Number + +let parseCell (sst : SharedStrings) (styles : Styles) (value : string) (dataType : string) (style : string) (formula : string) : obj*DataType = + + let cellValueString = if dataType <> null && dataType = DType.sharedString then sst.[int value] else value + let dt = parseDataType styles dataType style formula + match dt with + | Date -> + try + // datetime is written as float counting days since 1900. + // We use the .NET helper because we really do not want to deal with datetime issues. + System.DateTime.FromOADate(float cellValueString) + with + | _ -> cellValueString + | Boolean -> + // boolean is written as int/float either 0 or null + match cellValueString.ToLower() with + | "1" | "true" -> true + | "0" | "false" -> false + | _ -> cellValueString + | Number -> + try + float cellValueString + with + | _ -> + cellValueString + | Empty | String -> cellValueString + , + dt + +let parseWorksheet (name : string) (styles : Styles) (sharedStrings : SharedStrings) (sheet : ZipArchiveEntry) (wb : WorkBook) = + try + let ws = FsWorksheet(name) + let stream = sheet.Open() + let reader = System.Xml.XmlReader.Create(stream) + let relationships = getWsRelationships sheet.FullName wb + while reader.Read() do + if reader.NodeType = XmlNodeType.Element && reader.Name = "c" then + let r = reader.GetAttribute("r") + let t = reader.GetAttribute("t") + let s = reader.GetAttribute("s") + + let mutable v = null + let mutable f = null + let cellReader = reader.ReadSubtree() + while cellReader.Read() do + if cellReader.NodeType = XmlNodeType.Element && cellReader.Name = "v" then + v <- cellReader.ReadElementContentAsString() + if cellReader.NodeType = XmlNodeType.Element && cellReader.Name = "f" then + f <- cellReader.ReadElementContentAsString() + if v <> null && v <> "" then + let cellValue,dataType = parseCell sharedStrings styles v t s f + let cell = FsCell(cellValue,dataType = dataType,address = FsAddress(r)) + ws.AddCell(cell) |> ignore + if reader.NodeType = XmlNodeType.Element && reader.Name = "tablePart" then + printfn "TablePart" + let id = reader.GetAttribute("r:id") + printfn "ID: %s" id + let rel = relationships.[id] + printf "Target: %s" rel.Target + let table = + wb.GetEntry(rel.Target) + |> parseTable + printfn "tableref: %s" table.RangeAddress.Range + ws.AddTable(table) |> ignore + reader.Close() + ws + with + | err -> failwithf "Error while parsing worksheet \"%s\":%s" name err.Message + +let parseWorkbook (wb : ZipArchive) = + let newWb = new FsWorkbook() + let styles = getStyles wb + let sst = getSharedStrings wb + //let tables = getTables wb + let relationships = getWbRelationships wb + let wbPart = wb.GetEntry("xl/workbook.xml") + let wbStream = wbPart.Open() + let wbReader = System.Xml.XmlReader.Create(wbStream) + while wbReader.Read() do + if wbReader.Name = "sheet" then + let name = wbReader.GetAttribute("name") + let id = wbReader.GetAttribute("r:id") + let rel = relationships.[id] + let sheet = wb.GetEntry(rel.Target) + let ws = parseWorksheet name styles sst sheet wb + ws.RescanRows() + newWb.AddWorksheet (ws) |> ignore + newWb + +module FsWorkbook = + + open System.IO + + let fromZipArchive (wb : ZipArchive) = + parseWorkbook wb + + let fromStream (stream : Stream) = + use zip = new ZipArchive(stream) + fromZipArchive zip + + let fromBytes (bytes : byte []) = + use ms = new MemoryStream(bytes) + fromStream ms + + let fromFile (path : string) = + use fs = File.OpenRead(path) + fromStream fs \ No newline at end of file diff --git a/tests/FsSpreadsheet.ExcelIO.Tests/FsSpreadsheet.ExcelIO.Tests.fsproj b/tests/FsSpreadsheet.ExcelIO.Tests/FsSpreadsheet.ExcelIO.Tests.fsproj index d904c7ba..4dce63d4 100644 --- a/tests/FsSpreadsheet.ExcelIO.Tests/FsSpreadsheet.ExcelIO.Tests.fsproj +++ b/tests/FsSpreadsheet.ExcelIO.Tests/FsSpreadsheet.ExcelIO.Tests.fsproj @@ -7,6 +7,7 @@ + diff --git a/tests/FsSpreadsheet.ExcelIO.Tests/ZipArchiveReader.fs b/tests/FsSpreadsheet.ExcelIO.Tests/ZipArchiveReader.fs new file mode 100644 index 00000000..fa7ba0a2 --- /dev/null +++ b/tests/FsSpreadsheet.ExcelIO.Tests/ZipArchiveReader.fs @@ -0,0 +1,42 @@ +module ZipArchiveReader + +open Expecto +open TestingUtils +open FsSpreadsheet +open FsSpreadsheet.ExcelIO.ZipArchiveReader + +let tests_Read = testList "Read" [ + let readFromTestFile (testFile: DefaultTestObject.TestFiles) = + try + FsWorkbook.fromFile(testFile.asRelativePath) + with + | _ -> FsWorkbook.fromFile($"{DefaultTestObject.testFolder}/{testFile.asFileName}") + + testCase "FsCell equality" <| fun _ -> + let c1 = FsCell(1, DataType.Number, FsAddress("A2")) + let c2 = FsCell(1, DataType.Number, FsAddress("A2")) + let isStructEqual = c1.StructurallyEquals(c2) + Expect.isTrue isStructEqual "" + testCase "Excel" <| fun _ -> + let wb = readFromTestFile DefaultTestObject.TestFiles.Excel + Expect.isDefaultTestObject wb + testCase "Libre" <| fun _ -> + let wb = readFromTestFile DefaultTestObject.TestFiles.Libre + Expect.isDefaultTestObject wb + testCase "FableExceljs" <| fun _ -> + let wb = readFromTestFile DefaultTestObject.TestFiles.FableExceljs + Expect.isDefaultTestObject wb + testCase "ClosedXML" <| fun _ -> + let wb = readFromTestFile DefaultTestObject.TestFiles.ClosedXML + Expect.isDefaultTestObject wb + testCase "FsSpreadsheet" <| fun _ -> + let wb = readFromTestFile DefaultTestObject.TestFiles.FsSpreadsheetNET + wb.GetWorksheets().[0].GetCellAt(5,1) |> fun x -> (x.Value, x.DataType) |> printfn "%A" + Expect.isDefaultTestObject wb +] + + +[] +let main = testList "ZipArchiveReader" [ + tests_Read +]