-
Notifications
You must be signed in to change notification settings - Fork 16
/
File.lua
240 lines (211 loc) · 6.8 KB
/
File.lua
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
----------------------------------------------------------------------
-- csvigo.File
-- A class to manage comma-separated value (CSV) files, plus two
-- directly-usable functions and various helper functions to manage csv files.
-- By default these CSV files use a comma delimiter and " as the quote character.
-- The separator ',' can be user-defined. A common example is ' ', which allows
-- for space-separated values.
-- Ref:
-- http://www.lua.org/pil/20.4.html
-- http://www.torch.ch/manual/torch/utility#torchclass
----------------------------------------------------------------------
-- Quote a single csv field when necessary.
-- The value is converted with tostring, so numbers are accepted as well as
-- strings. If the text contains a double quote or any character of the
-- separator, it is wrapped in double quotes and every embedded quote is
-- doubled; otherwise it is returned unquoted.
-- @param s          field value (string or number)
-- @param separator  field separator string
-- @return the field as a string, quoted when necessary
local function escapeCsv(s, separator)
   s = tostring(s)
   -- Escape every non-alphanumeric character of the separator before placing
   -- it in the character class: a raw '%', ']' or '^' would otherwise change
   -- (or break) the pattern instead of being matched literally.
   local class = '["' .. (string.gsub(separator, '%W', '%%%0')) .. ']'
   if string.find(s, class) then
      s = '"' .. string.gsub(s, '"', '""') .. '"'
   end
   return s
end
-- Convert an array of strings or numbers into one row of a csv file.
-- @param t               array of field values, traversed with ipairs
-- @param separator       string placed between two consecutive fields
-- @param nan_as_missing  when true, NaN values are written as empty fields
-- @return the row as a string, without a trailing newline
local function tocsv(t, separator, nan_as_missing)
   local fields = {}
   for i, p in ipairs(t) do
      -- NaN is the only value that differs from itself
      if nan_as_missing and p ~= p then
         p = ''
      end
      fields[i] = escapeCsv(p, separator)
   end
   -- Joining once avoids quadratic string concatenation and, unlike stripping
   -- a single leading character, stays correct for multi-character separators.
   return table.concat(fields, separator)
end
-- Break one record (one line) of a csv file into an array of strings.
-- Fields that start with a double quote are read up to the closing quote and
-- doubled quotes inside them are turned back into single quotes.
-- @param s          the record, without its line terminator
-- @param separator  field separator, matched literally; the single space ' '
--                   is special and stands for any run of whitespace
-- @return array of field strings
-- Raises an error when s is nil, when the separator is empty, or when a
-- quoted field has no closing quote.
local function fromcsv(s, separator)
   if not s then error("s is null") end
   -- an empty separator can never terminate a field
   if separator == '' then error("separator must not be empty") end
   s = s .. separator -- end with separator so that every field is terminated
   -- Only the whitespace mode uses a Lua pattern. Every other separator is
   -- searched in plain mode so that characters such as '.' or '%' are literal.
   local plain = true
   if separator == ' ' then
      separator = '%s+'
      plain = false
   end
   local t = {}
   local fieldstart = 1
   local slen = string.len(s)
   repeat
      -- next field is quoted? (starts with "?)
      if string.find(s, '^"', fieldstart) then
         local a, c
         local i = fieldstart
         repeat
            -- find closing quote; a doubled quote is an escaped quote
            a, i, c = string.find(s, '"("?)', i+1)
         until c ~= '"' -- quote not followed by quote?
         if not i then error('unmatched "') end
         local f = string.sub(s, fieldstart+1, i-1)
         t[#t+1] = (string.gsub(f, '""', '"'))
         -- Resume after the END of the separator match, so multi-character
         -- separators and whitespace runs are skipped as a whole.
         local _, sepend = string.find(s, separator, i, plain)
         fieldstart = sepend + 1
      else
         local nexti, sepend = string.find(s, separator, fieldstart, plain)
         t[#t+1] = string.sub(s, fieldstart, nexti-1)
         fieldstart = sepend + 1
      end
   until fieldstart > slen
   return t
end
----------------------------------------------------------------------
-- create class Csv
-- The class is created through torch.class with the name "csvigo.File"; the
-- value it returns is kept in the local Csv, on which the methods below are
-- defined.
local Csv = torch.class("csvigo.File")
-- Initializer: open the file and remember the csv settings.
-- @param filepath        path handed to io.open
-- @param mode            mode string handed to io.open
-- @param separator       field separator; ',' when omitted
-- @param nan_as_missing  stored as given; false when omitted
-- Raises the message reported by io.open when the file cannot be opened.
function Csv:__init(filepath, mode, separator, nan_as_missing)
   local handle, reason = io.open(filepath, mode)
   self.filepath = filepath
   self.file = handle
   self.separator = separator or ','
   self.nan_as_missing = nan_as_missing or false
   if not handle then
      error(reason)
   end
end
-- Close the file handle that the initializer opened.
function Csv:close()
   local handle = self.file
   handle:close()
end
-- Return the line iterator of the underlying file handle; it walks over the
-- remaining lines as raw text (the lines are not split into fields).
function Csv:lines()
   local handle = self.file
   return handle:lines()
end
-- Read the next record from the csv file and split it into fields.
-- @return array of field strings, or nil at end of file
function Csv:read()
   local raw = self.file:read()
   if raw then
      -- strip CR characters before parsing
      local stripped = string.gsub(raw, '\r', '')
      return fromcsv(stripped, self.separator)
   end
   return nil
end
-- Read the whole file into memory and return a read-only, table-like view of
-- its rows ('large' mode).
-- The raw bytes are kept in one storage and a lookup of per-row
-- [offset, length] pairs is built by libcsvigo.create_lookup; a row is only
-- split into fields (with fromcsv) when it is indexed.
-- The returned table supports, through its metatable (also stored in out.mt):
--   out[i]       parsed fields of row i (asserts 1 <= i <= number of rows)
--   #out         number of rows
--   tostring     raw text of the rows, abbreviated for 30 rows or more
--   pairs/ipairs iteration over (row number, parsed fields)
--   assignment   raises "attempt to update a read-only table"
-- Raises an error when the torch package cannot be required.
function Csv:largereadall()
   local ok = pcall(require, 'torch')
   if not ok then
      error('large mode needs the torch package')
   end
   local libcsvigo = require 'libcsvigo'
   local ffi = require 'ffi'
   local path = self.filepath
   -- The file is re-opened by path in binary mode; self.file is not used here.
   local f = torch.DiskFile(path, 'r'):binary()
   f:seekEnd()
   -- NOTE(review): presumably DiskFile positions are 1-based, so the position
   -- at the end minus one is the byte count -- confirm against torch docs.
   local length = f:position() - 1
   f:seek(1)
   local data = f:readChar(length)
   f:close()
   -- now that the ByteStorage is constructed,
   -- one has to make a dictionary of [offset, length] pairs of the row.
   -- for efficiency, do one pass to count number of rows,
   -- and another pass to create a LongTensor and fill it
   local lookup = libcsvigo.create_lookup(data)
   local out = {}
   local separator = self.separator

   -- raw text of row i, taken from the storage at the row's offset and length
   local function stringm (i)
      assert(i, 'index has to be given')
      assert(i > 0 and i <= lookup:size(1), "index out of bounds: " .. i)
      return ffi.string(data:data() + lookup[i][1], lookup[i][2])
   end

   -- parsed fields of row i; used as the __index metamethod
   local function index (tbl, i)
      assert(i, 'index has to be given')
      assert(i > 0 and i <= lookup:size(1), "index out of bounds: " .. i)
      local line = ffi.string(data:data() + lookup[i][1], lookup[i][2])
      local entry = fromcsv(line, separator)
      return entry
   end

   -- Build a stateful iterator over all rows. It is a local closure so that
   -- iterating neither creates nor overwrites a global named `iter`.
   local function rowiterator (t)
      local counter = 0
      return function ()
         counter = counter + 1
         if counter <= lookup:size(1) then
            return counter, index(t, counter)
         end
         return nil
      end
   end

   out.mt = {}
   out.mt.__index = index
   out.mt.__newindex = function (t,k,v)
      error("attempt to update a read-only table", 2)
   end
   out.mt.__len = function (t)
      return lookup:size(1)
   end
   out.mt.__tostring = function(t)
      local s = ''
      if lookup:size(1) < 30 then
         -- short file: print every row
         for i = 1, lookup:size(1) do
            s = s .. stringm(i) .. '\n'
         end
      else
         -- long file: first rows, a filler, then the rows from size-10 to size
         for i = 1, 10 do
            s = s .. stringm(i) .. '\n'
         end
         for i = 1, 10 do
            s = s .. '.. .. .. .. .. .. .. .. .. \n'
         end
         for i = lookup:size(1)-10, lookup:size(1) do
            s = s .. stringm(i) .. '\n'
         end
      end
      return s
   end
   out.mt.__ipairs = function(t)
      return rowiterator(t), t, 0
   end
   out.mt.__pairs = function(t)
      return rowiterator(t), t, nil
   end
   -- the metatable provides: size, tostring, iterator, index,
   -- error on newindex
   setmetatable(out, out.mt)
   return out
end
-- Read every remaining record of the file in one call.
-- @param mode  pass 'large' to hand the work over to largereadall
-- @return array of records, each record being an array of field strings
--         (in 'large' mode: whatever largereadall returns)
function Csv:readall(mode)
   if mode == 'large' then
      return self:largereadall()
   end
   local records = {}
   local count = 0
   local raw = self.file:read("*l")
   while raw do
      -- strip CR characters, then split the line into its fields
      local stripped = string.gsub(raw, '\r', '')
      local fields = fromcsv(stripped, self.separator)
      count = count + 1
      records[count] = fields
      raw = self.file:read("*l")
   end
   return records
end
-- Write one record to the csv file, followed by \n.
-- The array is converted to csv format with tocsv, using the separator and
-- the nan_as_missing setting stored on the object.
-- @param a  array of strings|numbers
-- @return nothing; raises the message reported by the file write on failure
function Csv:write(a)
   -- locals: the write results must not leak into the global environment
   local res, msg = self.file:write(tocsv(a, self.separator, self.nan_as_missing), "\n")
   if res then return end
   error(msg)
end
-- Write all records of an array (table of tables), one line per record.
-- @param a               array of records, each an array of strings|numbers
-- @param nan_as_missing  optional; when given it decides whether NaN values
--                        are written as empty fields, otherwise the setting
--                        stored on the object is used
-- @return true; raises the message reported by the file write on failure
function Csv:writeall(a, nan_as_missing)
   -- compare against nil so that an explicit false is honoured
   if nan_as_missing == nil then
      nan_as_missing = self.nan_as_missing
   end
   for _, entry in ipairs(a) do
      -- locals: the write results must not leak into the global environment
      local res, msg = self.file:write(tocsv(entry, self.separator, nan_as_missing), "\n")
      if not res then error(msg) end
   end
   return true
end