local l=require"lib"
local any, sort, two, shuffle, norm, push, csv, min, pick, o, new =
l.any,l.sort,l.two,l.shuffle,l.norm,l.push,l.csv,l.min,l.pick,l.o,l.new
local lt,map = l.lt,l.map
local the,go={},{}
function go.h(_) print(string.format([[
kseed.lua : multi-objective optimization via kmeans++ initialization.
(c) 2024 Tim Menzies <timm@ieee.org>, MIT license.
USAGE:
lua kseed.lua [OPTIONS] [DEMO]
OPTIONS:
-d file csv file of data (default: %s)
-p int coefficient of distance (default: %s)
-r int random number seed (default: %s)
-s int #samples searched for each new centroid (default: %s)
DEMO:
-header header generation
-csv [file] csv reader
-data [file] loading rows
-x [file] sorting rows by x values
-y [file] sorting rows by y values
-around [file] find very good examples via kmeans++ initialization
]], the.data, the.p, the.rseed, the.samples)) end
Config options
the= {p= 2,
data= "../../moot/optimize/misc/auto93.csv",
rseed= 1234567891,
samples= 32}
local Big=1E32
Handlers for command line arguments -d, -p, -s, -r
function go.p(x) the.p = x+0 end
function go.d(x) the.data = x end
function go.s(x) the.samples = x+0 end
function go.r(x) the.rseed = x+0; math.randomseed(the.rseed); end
This code is so simple, it only needs summaries of numeric columns
local Num, Data = {},{}
Summarize numeric columns
function Num:new(name)
return new(Num,{lo= Big, -- smallest number seen in a column
hi= -Big, -- largest number seen in a column
utopia = name:find"-$" and 0 or 1 -- (min|max)imize = 0,1
}) end
Holds the rows and column summaries.
function Data:new(src)
self = new(Data, {
num ={}, -- num[i] : list[Num]
x= {}, -- independent columns (keyed by column number)
y= {}, -- dependent columns (keyed by column number)
rows={}, -- set of rows
xs=0,ys=0
})
for k,s in pairs(src()) do sekf:newHeader(k,s) end
return self:adds(src) end
function Data:newHeader(k,s)
if s:find"^[A-Z]" then self.num[k] = Num:new(s) end
if not s:find"X$" then
if s:find"[!+-]$" then self.y[k] = self.num[k] else self.x[k] = k end
if self.x[k] then self.xs=1+self.xs end
if self.y[k] then self.ys=1+self.ys end end end
Update knowledge of numeric columns. Ensures x
is a numeric.
function Num:add(x)
if x=="?" then return x end
x = x + 0
self.lo = math.min(x, self.lo)
self.hi = math.max(x, self.hi)
return x end
Updates the rows and column summaries.
function Data:add(t)
for k,num in pairs(self.num) do t[k] = num:add(t[k]) end
push(self.rows, t)
return self end
Builds a new Data
from a csv file.
function Data:adds(src)
for t in src do self:add(t) end
return self end
Normalize a number 0..1
function Num:norm(x)
return x=="?" and x or (x - self.lo) / (self.hi - self.lo) end
IB1’s distance between independent variables in row t1
, t2
. See section 2.4 of
https://link.springer.com/article/10.1007/BF00153759
function Data:xdist(t1,t2, n,d,DIST)
DIST= function(num, a,b)
if (a=="?" and b=="?") then return 1 end -- if all unknown, assume max
if not num then return a == b and 0 or 1 end -- non-numerics are easy
a,b = num:norm(a), num:norm(b)
a = a ~= "?" and a or (b<0.5 and 1 or 0) -- when in doubt, assume max
b = b ~= "?" and b or (a<0.5 and 1 or 0) -- when in doubt, assume max
return math.abs(a - b) end
d,n=0,0
for k,col in pairs(self.x) do
n = n + 1
d = d + math.abs(DIST(self.num[k], t1[k], t2[k])) ^ the.p end
return (d/n) ^ (1/the.p) end
Return distance to utopia of the dependent variables.
function Data:ydist(t, n,d)
d,n=0,0
for k,num in pairs(self.y) do
n = n + 1
d = d + math.abs(num:norm(t[k]) - num.utopia) ^ the.p end
return (d/n) ^ (1/the.p) end
kmeans++ initialization. New centroids are distance^2 from existing ones.
function Data:around(k, rows, t,out,r1,r2,u)
rows = rows or self.rows
out = {any(rows)}
for _ = 2,k do
u={}
for _ = 1,math.min(the.samples, #rows) do
r1 = any(rows)
r2 = min(out, function(ru) return self:xdist(r1,ru) end) -- who ru closest 2?
if r1 and r2 then
u[r1]= self:xdist(r1,r2)^2 end-- how close are you
end
push(out, pick(u)) -- stochastically pick one item
end
return out end
local Sample=require"stats"
e.g. command line option lua kseed.lua -header
calls test case eg.header(_)
.
function go.header(_, data)
data = Data:new(l.items({{"name","Age","Shoesize-"}}))
print(o(data)) end
function go.csv(file, data,k)
k=0
for row in csv(file or the.data) do
k=k+1
if k==1 or k%30==0 then print(o(row)) end end end
function go.data(file, data) data= Data:new(csv(file or the.data))
print(#data.rows, o(data.y)) end
function go.xs(file, data,X)
data= Data:new(csv(file or the.data))
X = function(row) return data:xdist(data.rows[1], row) end
XX= function(a,b) return X(a) < X(b) end
for k,row in pairs(sort(data.rows,XX)) do
if k==1 or k% 30==0 then print(o(row), X(row)) end end end
function go.ys(file, data,Y)
data= Data:new(csv(file or the.data))
Y = function(row) return data:ydist(row) end
YY= function(a,b) return Y(a) < Y(b) end
for k,row in pairs(sort(data.rows,YY)) do
if k==1 or k% 30==0 then print(o(row), Y(row)) end end end
function go.around(file, data)
data= Data:new(csv(file or the.data))
Y = function(row) return data:ydist(row) end
for _=1,20 do
shuffle(data.rows)
print(Y(sort(data:around(20),two(Y))[1])) end end
function go.compare(file, b4,copy,repeats,data,all,first,want,rand,u,report)
SORTER=function(a,b)
return a._meta.mu < b._meta.mu or
(a._meta.mu == b._meta.mu and a.txt < b.txt) end
G = function(x) return string.format("%.2f",x) end
G0 = function(x) return 100*x//1 end
repeats=50
data= Data:new(csv(file or the.data))
Y = function(row) return data:ydist(row) end
b4=Sample:new(0)
all={b4}
copy={}
copy[0]=b4
for _,r in pairs(data.rows) do all[1]:add(Y(r)) end
math.randomseed(1)
for _,k in pairs{15,20,25,30,40,80,160} do
push(all,Sample:new(k))
copy[k]=all[#all]
for _=1,repeats do
shuffle(data.rows)
all[#all]:add(sort(map(data:around(k),Y))[1]) end end
first = sort(Sample.merges(sort(all,lt"mu"),b4.sd*0.35),SORTER)[1]
want = first.txt
rand=Sample:new(-1)
push(all,rand)
copy[-1] = rand
for _=1,repeats do
shuffle(data.rows)
u={}; for j,row in pairs(data.rows) do if j > want then break else push(u,Y(row)) end end
rand:add(sort(u)[1]) end
all= sort(Sample.merges(sort(all,lt"mu"),b4.sd*0.35),SORTER)
report = {G0((b4.mu - first._meta.mu)/(b4.mu - b4.lo)), --1 = delta
#data.rows, -- 2 = nrows
data.xs, -- 3 = xs
data.ys, -- 4 = ys
G(b4.lo)} -- 5 = lo
for _,k in pairs{0, --6 =
-1,15,20,25,30,40,80,160} do
push(report, string.format("%.2f%s",copy[k].mu, copy[k]._meta.rank)) end
push(report,(file or the.data):gsub("^.*/",""))
print(table.concat(report,", ")) end
function go.all(_)
l.run({"header","csv","data","xs","ys","around"}, go, the.seed) end
math.randomseed(the.rseed)
if arg[0]:find"kseed" then
for k,v in pairs(arg) do
if go[v:sub(2)] then go[v:sub(2)](arg[k+1]) end end end
return {the=the, Data=Data, Num=Num}