diff --git a/src/FsSpreadsheet.ExcelIO/FsSpreadsheet.ExcelIO.fsproj b/src/FsSpreadsheet.ExcelIO/FsSpreadsheet.ExcelIO.fsproj index d4f73109..c73a10e5 100644 --- a/src/FsSpreadsheet.ExcelIO/FsSpreadsheet.ExcelIO.fsproj +++ b/src/FsSpreadsheet.ExcelIO/FsSpreadsheet.ExcelIO.fsproj @@ -29,6 +29,7 @@ + diff --git a/src/FsSpreadsheet.ExcelIO/ZipArchiveReader.fs b/src/FsSpreadsheet.ExcelIO/ZipArchiveReader.fs new file mode 100644 index 00000000..9f48e3ed --- /dev/null +++ b/src/FsSpreadsheet.ExcelIO/ZipArchiveReader.fs @@ -0,0 +1,338 @@ +module FsSpreadsheet.ExcelIO.ZipArchiveReader + +open System.IO.Compression +open System.Xml + +open FsSpreadsheet + +let inline bool (v : string) = + match v with + | null | "0" -> false + | "1" -> true + | _ -> + let v' = v.ToLower() + if v' = "true" then true + elif v' = "false" then false + else failwith "Invalid boolean value" + + +module DType = + [] + let boolean = "b" + [] + let number = "n" + [] + let error = "e" + [] + let sharedString = "s" + [] + let string = "str" + [] + let inlineString = "5" + [] + let date = "d" + +type Relationship = + { Id : string + Type : string + Target : string } + +type WorkBook = ZipArchive + +type SharedStrings = string [] + +type Relationships = Map + +let dateTimeFormats = Set.ofList [14..22] +let customFormats = Set.ofList [164 .. 180] + + +type NumberFormat = + { Id : int + FormatCode : string } + + static member isDateTime (numberFormat : NumberFormat) = + let format = numberFormat.FormatCode + let input = System.DateTime.Now.ToString(format, System.Globalization.CultureInfo.InvariantCulture) + let dt = System.DateTime.ParseExact( + input, + format, + System.Globalization.CultureInfo.InvariantCulture, + System.Globalization.DateTimeStyles.NoCurrentDateDefault + ) + dt <> Unchecked.defaultof + +and CellFormat = + { + NumberFormatId : int + ApllyNumberFormat : bool + } + + static member isDateTime (styles : Styles) (cf : CellFormat) = + // if numberformatid is between 14 and 18 it is standard date time format. + // custom formats are given in the range of 164 to 180, all none default date time formats fall in there. + if Set.contains cf.NumberFormatId dateTimeFormats then + true + elif Set.contains cf.NumberFormatId customFormats then + styles.NumberFormats.TryFind cf.NumberFormatId + |> Option.map NumberFormat.isDateTime + |> Option.defaultValue true + else + false + +and Styles = + { + NumberFormats : Map + CellFormats : CellFormat [] + } + + +module XmlReader = + let isElemWithName (reader : System.Xml.XmlReader) (name : string) = + reader.NodeType = XmlNodeType.Element && (reader.Name = name || reader.Name = "x:" + name) + +let parseRelationsships (relationships : ZipArchiveEntry) : Relationships = + try + use relationshipsStream = relationships.Open() + use relationshipsReader = System.Xml.XmlReader.Create(relationshipsStream) + let mutable rels = [||] + while relationshipsReader.Read() do + if XmlReader.isElemWithName relationshipsReader "Relationship" then + let id = relationshipsReader.GetAttribute("Id") + let typ = relationshipsReader.GetAttribute("Type") + let target = + let t = relationshipsReader.GetAttribute("Target") + if t.StartsWith "xl/" then t + elif t.StartsWith "/xl/" then t.Replace("/xl/","xl/") + elif t.StartsWith "../" then t.Replace("../","xl/") + else "xl/" + t + rels <- Array.append rels [|{Id = id; Type = typ; Target = target}|] + rels + |> Array.map (fun r -> r.Id, r) |> Map.ofArray + with + | _ -> Map.empty + +let getWbRelationships (wb : WorkBook) = + wb.GetEntry("xl/_rels/workbook.xml.rels") + |> parseRelationsships + +let getWsRelationships (ws : string) (wb : WorkBook) = + wb.GetEntry(ws.Replace("worksheets/","worksheets/_rels/").Replace(".xml",".xml.rels")) + |> parseRelationsships + +let getSharedStrings (wb : WorkBook) : SharedStrings = + try + + let sharedStrings = wb.GetEntry("xl/sharedStrings.xml") + use sharedStringsStream = sharedStrings.Open() + use sharedStringsReader = System.Xml.XmlReader.Create(sharedStringsStream) + [| + while sharedStringsReader.Read() do + if XmlReader.isElemWithName sharedStringsReader "si" then + use subReader = sharedStringsReader.ReadSubtree() + while subReader.Read() do + if XmlReader.isElemWithName subReader "t" then + yield subReader.ReadElementContentAsString() + |] + with + | _ -> [||] + +let getStyles (wb : WorkBook) = + let styles = wb.GetEntry("xl/styles.xml") + use stylesStream = styles.Open() + use stylesReader = System.Xml.XmlReader.Create(stylesStream) + let mutable numberFormats = Map.empty + let mutable cellFormats = Array.empty + while stylesReader.Read() do + if XmlReader.isElemWithName stylesReader "numFmts" then + use subReader = stylesReader.ReadSubtree() + let numFmts = + [| + while subReader.Read() do + if XmlReader.isElemWithName subReader "numFmt" then + let id = subReader.GetAttribute("numFmtId") |> int + let formatCode = subReader.GetAttribute("formatCode") + yield id, {Id = id; FormatCode = formatCode} + |] + numberFormats <- Map.ofArray numFmts + if XmlReader.isElemWithName stylesReader "cellXfs" then + use subReader = stylesReader.ReadSubtree() + let cellFmts = + [| + while subReader.Read() do + if XmlReader.isElemWithName subReader "xf" then + let numFmtId = subReader.GetAttribute("numFmtId") |> int + let applyNumberFormat = subReader.GetAttribute("applyNumberFormat") |> bool + yield {NumberFormatId = numFmtId; ApllyNumberFormat = applyNumberFormat} + |] + cellFormats <- cellFmts + {NumberFormats = numberFormats; CellFormats = cellFormats} + + +let parseTable (sheet : ZipArchiveEntry) = + try + use stream = sheet.Open() + use reader = System.Xml.XmlReader.Create(stream) + let mutable t = None + while reader.Read() do + if XmlReader.isElemWithName reader "table" then + let area = reader.GetAttribute("ref") + let ra = FsRangeAddress(area) + let totalsRowShown = + let attribute = reader.GetAttribute("totalsRowShown") + match attribute with + | null + | "0" -> false + | "1" -> true + | _ -> false + let name = + let dn = reader.GetAttribute("displayName") + if dn = null then + reader.GetAttribute("name") + else dn + t <- Some (FsTable(name, ra, totalsRowShown, true)) + if t.IsNone then + failwith "No table found" + else + t.Value + with + | err -> failwithf "Error while parsing table \"%s\":%s" sheet.FullName err.Message + +//zip.Entries +//|> Seq.map (fun e -> e.Name) +//|> Seq.toArray + +// Apply the parseTable function to every zip entry starting with "xl/tables" +let getTables (wb : WorkBook) = + wb.Entries + |> Seq.choose (fun e -> + if e.FullName.StartsWith("xl/tables") && e.FullName <> "xl/tables" && e.FullName <> "xl/tables/" then + parseTable e + |> Some + else None) + |> Seq.toArray + +let parseCell (sst : SharedStrings) (styles : Styles) (value : string) (dataType : string) (style : string) (formula : string) : obj*DataType = + // LibreOffice annotates boolean values as formulas instead of boolean datatypes + if formula <> null && formula = "TRUE()" then + true,DataType.Boolean + elif formula <> null && formula = "FALSE()" then + false,DataType.Boolean + else + let cellValueString = if dataType <> null && dataType = DType.sharedString then sst.[int value] else value + //https://stackoverflow.com/a/13178043/12858021 + //https://stackoverflow.com/a/55425719/12858021 + // if styleindex is not null and datatype is null we propably have a DateTime field. + // if datatype would not be null it could also be boolean, as far as i tested it ~Kevin F 13.10.2023 + if style <> null && (dataType = null || dataType = DType.number) then + try + let cellFormat : CellFormat = styles.CellFormats.[int style] + if (*cellFormat <> null &&*) CellFormat.isDateTime styles cellFormat then + System.DateTime.FromOADate(float cellValueString), DataType.Date + else + float value, DataType.Number + with + | _ -> value, DataType.Number + else + match dataType with + | DType.boolean -> + match cellValueString.ToLower() with + | "1" | "true" -> true + | "0" | "false" -> false + | _ -> cellValueString + , + DataType.Boolean + | DType.date -> + try + // datetime is written as float counting days since 1900. + // We use the .NET helper because we really do not want to deal with datetime issues. + System.DateTime.FromOADate(float cellValueString), DataType.Date + with + | _ -> cellValueString, DataType.Date + | DType.error -> cellValueString, DataType.Empty + | DType.inlineString + | DType.sharedString + | DType.string -> cellValueString, DataType.String + | _ -> + try + float cellValueString + with + | _ -> cellValueString + , + DataType.Number + +let parseWorksheet (name : string) (styles : Styles) (sharedStrings : SharedStrings) (sheet : ZipArchiveEntry) (wb : WorkBook) = + try + let ws = FsWorksheet(name) + let relationships = getWsRelationships sheet.FullName wb + use stream = sheet.Open() + use reader = System.Xml.XmlReader.Create(stream) + while reader.Read() do + if XmlReader.isElemWithName reader "c" then + let r = reader.GetAttribute("r") + let t = reader.GetAttribute("t") + let s = reader.GetAttribute("s") + + let mutable v = null + let mutable f = null + use cellReader = reader.ReadSubtree() + while cellReader.Read() do + if XmlReader.isElemWithName cellReader "v" then + v <- cellReader.ReadElementContentAsString() + if XmlReader.isElemWithName cellReader "f" then + f <- cellReader.ReadElementContentAsString() + if v <> null && v <> "" || f <> null then + let cellValue,dataType = parseCell sharedStrings styles v t s f + let cell = FsCell(cellValue,dataType = dataType,address = FsAddress(r)) + ws.AddCell(cell) |> ignore + if XmlReader.isElemWithName reader "tablePart" then + let id = reader.GetAttribute("r:id") + let rel = relationships.[id] + let table = + wb.GetEntry(rel.Target) + |> parseTable + ws.AddTable(table) |> ignore + reader.Close() + ws + with + | err -> failwithf "Error while parsing worksheet \"%s\":%s" name err.Message + +let parseWorkbook (wb : ZipArchive) = + let newWb = new FsWorkbook() + let styles = getStyles wb + let sst = getSharedStrings wb + //let tables = getTables wb + let relationships = getWbRelationships wb + let wbPart = wb.GetEntry("xl/workbook.xml") + use wbStream = wbPart.Open() + use wbReader = System.Xml.XmlReader.Create(wbStream) + while wbReader.Read() do + if XmlReader.isElemWithName wbReader "sheet" then + let name = wbReader.GetAttribute("name") + let id = wbReader.GetAttribute("r:id") + let rel = relationships.[id] + let sheet = wb.GetEntry(rel.Target) + let ws = parseWorksheet name styles sst sheet wb + ws.RescanRows() + newWb.AddWorksheet (ws) |> ignore + newWb + + +module FsWorkbook = + + open System.IO + + let fromZipArchive (wb : ZipArchive) = + parseWorkbook wb + + let fromStream (stream : Stream) = + use zip = new ZipArchive(stream) + fromZipArchive zip + + let fromBytes (bytes : byte []) = + use ms = new MemoryStream(bytes) + fromStream ms + + let fromFile (path : string) = + use fs = File.OpenRead(path) + fromStream fs \ No newline at end of file diff --git a/src/FsSpreadsheet/FsAddress.fs b/src/FsSpreadsheet/FsAddress.fs index 26e2348e..872d28bd 100644 --- a/src/FsSpreadsheet/FsAddress.fs +++ b/src/FsSpreadsheet/FsAddress.fs @@ -37,16 +37,24 @@ module CellReference = let ofIndices column (row : uint32) = sprintf "%s%i" (indexToColAdress column) row + + /// Maps a "A1" style excel cell reference to a column * row index tuple (1 Based indices). let toIndices (reference : string) = - let inp = reference.ToUpper() - let regex = indexRegex.Match(inp) + let charPart = System.Text.StringBuilder() + let numPart = System.Text.StringBuilder() - if regex.Success then - regex.Groups - |> fun a -> colAdressToIndex a.[1].Value, uint32 a.[2].Value - else - failwithf "Reference %s does not match Excel A1-style" reference + reference + |> Seq.iter (fun c -> + if System.Char.IsLetter c then + charPart.Append c |> ignore + elif System.Char.IsDigit c then + numPart.Append c |> ignore + else + failwithf "Reference %s does not match Excel A1-style" reference + ) + colAdressToIndex (charPart.ToString()), uint32 (numPart.ToString()) + /// Maps a "A1" style excel cell reference to a column (1 Based indices). let toColIndex (reference : string) = diff --git a/src/FsSpreadsheet/FsWorksheet.fs b/src/FsSpreadsheet/FsWorksheet.fs index 1473c198..c787e99c 100644 --- a/src/FsSpreadsheet/FsWorksheet.fs +++ b/src/FsSpreadsheet/FsWorksheet.fs @@ -366,13 +366,19 @@ type FsWorksheet (name, ?fsRows, ?fsTables, ?fsCellsCollection) = _cells.GetCells() |> Seq.groupBy (fun c -> c.RowNumber) |> Seq.iter (fun (rowIndex,cells) -> + let mutable min = 1 + let mutable max = 1 + cells + |> Seq.iter (fun c -> + let cn = c.ColumnNumber + if cn < min then min <- cn + if cn > max then max <- cn + ) let newRange = - cells - |> Seq.sortBy (fun c -> c.ColumnNumber) - |> fun cells -> - FsAddress(rowIndex,Seq.head cells |> fun c -> c.ColumnNumber), - FsAddress(rowIndex,Seq.last cells |> fun c -> c.ColumnNumber) - |> FsRangeAddress + FsRangeAddress( + FsAddress(rowIndex,min), + FsAddress(rowIndex,max) + ) match Map.tryFind rowIndex rows with | Some row -> row.RangeAddress <- newRange diff --git a/tests/FsSpreadsheet.ExcelIO.Tests/FsSpreadsheet.ExcelIO.Tests.fsproj b/tests/FsSpreadsheet.ExcelIO.Tests/FsSpreadsheet.ExcelIO.Tests.fsproj index d904c7ba..4dce63d4 100644 --- a/tests/FsSpreadsheet.ExcelIO.Tests/FsSpreadsheet.ExcelIO.Tests.fsproj +++ b/tests/FsSpreadsheet.ExcelIO.Tests/FsSpreadsheet.ExcelIO.Tests.fsproj @@ -7,6 +7,7 @@ + diff --git a/tests/FsSpreadsheet.ExcelIO.Tests/ZipArchiveReader.fs b/tests/FsSpreadsheet.ExcelIO.Tests/ZipArchiveReader.fs new file mode 100644 index 00000000..8ac28337 --- /dev/null +++ b/tests/FsSpreadsheet.ExcelIO.Tests/ZipArchiveReader.fs @@ -0,0 +1,52 @@ +module ZipArchiveReader + +open TestingUtils +open FsSpreadsheet +open FsSpreadsheet.ExcelIO.ZipArchiveReader + +let tests_Read = testList "Read" [ + let readFromTestFile (testFile: DefaultTestObject.TestFiles) = + try + FsWorkbook.fromFile(testFile.asRelativePath) + with + | _ -> FsWorkbook.fromFile($"{DefaultTestObject.testFolder}/{testFile.asFileName}") + + testCase "FsCell equality" <| fun _ -> + let c1 = FsCell(1, DataType.Number, FsAddress("A2")) + let c2 = FsCell(1, DataType.Number, FsAddress("A2")) + let isStructEqual = c1.StructurallyEquals(c2) + Expect.isTrue isStructEqual "" + testCase "Excel" <| fun _ -> + let wb = readFromTestFile DefaultTestObject.TestFiles.Excel + Expect.isDefaultTestObject wb + testCase "Libre" <| fun _ -> + let wb = readFromTestFile DefaultTestObject.TestFiles.Libre + Expect.isDefaultTestObject wb + testCase "FableExceljs" <| fun _ -> + let wb = readFromTestFile DefaultTestObject.TestFiles.FableExceljs + Expect.isDefaultTestObject wb + testCase "ClosedXML" <| fun _ -> + let wb = readFromTestFile DefaultTestObject.TestFiles.ClosedXML + Expect.isDefaultTestObject wb + testCase "FsSpreadsheet" <| fun _ -> + let wb = readFromTestFile DefaultTestObject.TestFiles.FsSpreadsheetNET + wb.GetWorksheets().[0].GetCellAt(5,1) |> fun x -> (x.Value, x.DataType) |> printfn "%A" + Expect.isDefaultTestObject wb +] + +open FsSpreadsheet.ExcelIO + +let performanceTest = testList "Performance" [ + testCase "BigFile" <| fun _ -> + let readF() = FsWorkbook.fromFile("./TestFiles/BigFile.xlsx") |> ignore + let refReadF() = FsWorkbook.fromXlsxFile("./TestFiles/BigFile.xlsx") |> ignore + Expect.isFasterThan readF refReadF "ZipArchiveReader should be faster than standard reader" + //Expect.equal (wb.GetWorksheetAt(1).Rows.Count) 153991 "Row count should be equal" +] + + +[] +let main = testList "ZipArchiveReader" [ + performanceTest + tests_Read +] diff --git a/tests/FsSpreadsheet.Tests/FsWorksheetTests.fs b/tests/FsSpreadsheet.Tests/FsWorksheetTests.fs index e621ed10..0e8c7400 100644 --- a/tests/FsSpreadsheet.Tests/FsWorksheetTests.fs +++ b/tests/FsSpreadsheet.Tests/FsWorksheetTests.fs @@ -38,9 +38,22 @@ let tests_SortRows = testList "SortRows" [ Utils.Expect.mySequenceEqual ws.Rows rows "equal" ] +let tests_rescanRows = testList "RescanRows" [ + testCase "empty" <| fun _ -> + dummySheet1.RescanRows() + Expect.hasLength dummySheet1.Rows 0 "row count" + testCase "rows" <| fun _ -> + let ws = createBigDummySheet() + let rows = ResizeArray(ws.Rows) // create copy + ws.RescanRows() + Utils.Expect.mySequenceEqual ws.Rows rows "equal" +] + + let main = testSequenced <| testList "FsWorksheet" [ tests_SortRows + tests_rescanRows testList "FsCell data" [ // TO DO: Ask TM: useful? or was that a mistake? (since the same test is seen in FsCell.fs) testList "Data | DataType | Adress" [ diff --git a/tests/Speedtest/Program.fs b/tests/Speedtest/Program.fs index c01019f0..acb53bef 100644 --- a/tests/Speedtest/Program.fs +++ b/tests/Speedtest/Program.fs @@ -21,6 +21,20 @@ let main argv = readStudy() |> ignore readInvestigation() |> ignore + let zipArchiveReader() = + + + let readAssay() = ZipArchiveReader.FsWorkbook.fromFile assayPath + let readStudy() = ZipArchiveReader.FsWorkbook.fromFile studyPath + let readInvestigation() = ZipArchiveReader.FsWorkbook.fromFile investigationPath + let bigFile() = ZipArchiveReader.FsWorkbook.fromFile @"C:\Users\HLWei\source\repos\IO\FsSpreadsheet\tests\TestUtils\TestFiles\BigFile.xlsx" + + + readInvestigation() |> ignore + readAssay() |> ignore + readStudy() |> ignore + bigFile() |> ignore + let closedXML() = // Read xlsx file using closedxml @@ -32,8 +46,36 @@ let main argv = readStudy() |> ignore readInvestigation() |> ignore + let randomReadArchives () = - fsSpreadsheet() - closedXML() + let readArchive(p : string) = + let zip = System.IO.Compression.ZipFile.OpenRead(p) + let e1 = zip.GetEntry("xl/worksheets/sheet1.xml") + use stream1 = e1.Open() + use reader1 = System.Xml.XmlReader.Create(stream1) + while reader1.Read() do + () + let e2 = zip.GetEntry("xl/worksheets/sheet2.xml") + use stream2 = e2.Open() + use reader2 = System.Xml.XmlReader.Create(stream2) + while reader2.Read() do + () + let e3 = zip.GetEntry("xl/worksheets/sheet3.xml") + use stream3 = e3.Open() + use reader3 = System.Xml.XmlReader.Create(stream3) + while reader3.Read() do + () + + let readAssay() = readArchive(assayPath) + let readStudy() = readArchive(studyPath) + let readInvestigation() = readArchive(investigationPath) + + readAssay() |> ignore + readStudy() |> ignore + readInvestigation() |> ignore + closedXML() + fsSpreadsheet() + zipArchiveReader() + //randomReadArchives () 1 \ No newline at end of file diff --git a/tests/TestUtils/TestingUtils.fs b/tests/TestUtils/TestingUtils.fs index b854976d..063cf1de 100644 --- a/tests/TestUtils/TestingUtils.fs +++ b/tests/TestUtils/TestingUtils.fs @@ -29,6 +29,22 @@ module Utils = Seq.mapi2 (fun i s p -> i,s,p) s1 s2 |> Seq.find (function |_,Some s,Some p when s=p -> false |_-> true) +open System + +[] +type Stopwatch() = + member val StartTime: DateTime option = None with get, set + member val StopTime: DateTime option = None with get, set + member this.Start() = this.StartTime <- Some DateTime.Now + member this.Stop() = + match this.StartTime with + | Some _ -> this.StopTime <- Some DateTime.Now + | None -> failwith "Error. Unable to call `Stop` before `Start`." + member this.Elapsed : TimeSpan = + match this.StartTime, this.StopTime with + | Some start, Some stop -> stop - start + | _, _ -> failwith "Error. Unable to call `Elapsed` without calling `Start` and `Stop` before." + /// Fable compatible Expecto/Mocha unification module Expect = @@ -57,6 +73,30 @@ module Expect = let comp = Utils.firstDiff actual expected _sequenceEqual message comp + let wantFaster (f : unit -> 'T) (maxMilliseconds : int) (message : string) = + let stopwatch = Stopwatch() + stopwatch.Start() + let res = f() + stopwatch.Stop() + let elapsed = stopwatch.Elapsed + if elapsed.TotalMilliseconds > float maxMilliseconds then + failwithf $"{message}. Expected to be faster than {maxMilliseconds}ms, but took {elapsed.TotalMilliseconds}ms" + res + + let isFasterThan (f1 : unit -> _) (f2 : unit -> _) (message : string) = + let stopwatch = Stopwatch() + stopwatch.Start() + f1() + stopwatch.Stop() + let elapsed1 = stopwatch.Elapsed + stopwatch.Start() + f2() + stopwatch.Stop() + let elapsed2 = stopwatch.Elapsed + if elapsed1.TotalMilliseconds > elapsed2.TotalMilliseconds then + failwithf $"{message}. Expected {elapsed1.TotalMilliseconds}ms to be faster than {elapsed2.TotalMilliseconds}ms" + () + let cellSequenceEquals (actual: FsCell seq) (expected: FsCell seq) message = let cellDiff (s1: FsCell seq) (s2: FsCell seq) = let s1 = Seq.append (Seq.map Some s1) (Seq.initInfinite (fun _ -> None)) @@ -143,18 +183,3 @@ module Test = let testList = testList -open System - -[] -type Stopwatch() = - member val StartTime: DateTime option = None with get, set - member val StopTime: DateTime option = None with get, set - member this.Start() = this.StartTime <- Some DateTime.Now - member this.Stop() = - match this.StartTime with - | Some _ -> this.StopTime <- Some DateTime.Now - | None -> failwith "Error. Unable to call `Stop` before `Start`." - member this.Elapsed : TimeSpan = - match this.StartTime, this.StopTime with - | Some start, Some stop -> stop - start - | _, _ -> failwith "Error. Unable to call `Elapsed` without calling `Start` and `Stop` before." \ No newline at end of file