code :: data :: issues
©2024, tim menzies (timm@ieee.org)


How to make and test defensible decisions from incomplete knowledge, while maturing your model of the world.

About

tHis code is about less, but better, analytics where “better”= faster, cheaper, explicable, really simple to code, no complex models (mostly instance-based AI). - Using as few dependent variables as possible.. - …incrementally build models that recognize (best,rest) examples (where “best” can be defined by multiple goals).

For this code

local the,help = {},[[
min.lua : multiple-objective active learning
(c) 2024, Tim Menzies <timm@ieee.org>, BSD-2.

USAGE:
  chmod +x min.lua
  ./min.lua [OPTIONS] [ARGS]

OPTIONS:
  -all            run test suite
  -b begin  int   initial samples   = 4   -- 
  -B Break  int   max samples       = 30
  -c cut    int   items to sort     = 100
  -C Cohen  float small effect      = .35
  -e elite  int   elite sample size = 4
  -f fars   int   samples for far   = 30
  -h              show help            
  -k k      int   Bayes param       = 0
  -l leaf   float leaf size         = .5
  -m m      int   Bayes param       = 3
  -p p      int   distance power    = 2
  -r ranges int   max num of bins   = 10
  -s seed   int   random seed       = 1234567891
  -t train  str   csv file          =  ../../moot/optimize/misc/auto93.csv
  -T Top    float best set size     = .5]]

local NUM,SYM,COLS,DATA,l = {},{},{},{},{}

Create

function SYM:new(  i: int, is: str) --> SYM 
  i, is = i or 0, is or " "
  return l.new(SYM, {n=0, i=i, is=is, has={}, most=0, mode=nil}) end

function NUM:new(  i: int, is: str) --> SYM
  i, is = i or 0, is or " "
  return l.new(NUM, {n=0, i=i, is=is, mu=0, sd=0, m2=0, lo=l.big, hi=-l.big,
                     goal = is:find"-$" and 0 or 1}) end

function COLS:new(names : list[str],     all,x,y,col) --> COLS
  all,x,y = {},{},{}
  for i,is in pairs(names) do
    col = l.push(all, (is:find"^[A-Z]" and NUM or SYM):new(i,is))
    if not is:find"X$" then
      l.push(is:find"[!+-]$" and y or x, col) end end
  return l.new(COLS, {names=names, all=all, x=x, y=y}) end

function DATA:new() -->  DATA
  return l.new(DATA, {rows={}, cols=nil}) end

function DATA:clone(  rows: rows) --> DATA 
  return DATA:new():from({self.cols.names}):from(rows) end

Update

function DATA:csv(file:str)  --> DATA
  l.csv(file, function(n,row) 
              table.insert(row, n==0 and "idX" or n)
              self:add(row) end)
  return self end

function DATA:from(  rows:rows) --> DATA
  for _,row in pairs(rows or {}) do self:add(row) end
  return self end

function DATA:add(row:row) --> nil
  if   self.cols 
  then l.push(self.rows,self.cols:add(row)) 
  else self.cols=COLS:new(row) end end

function COLS:add(row: row) --> row
  for _,cols in pairs{self.x, self.y} do
    for _,col in pairs(cols) do
      col:add( row[col.i] ) end end
  return row end

function NUM:add(x: num,    d) -->  nil
  if x ~= "?" then
    self.n  = self.n + 1
    d       = x - self.mu
    self.mu = self.mu + d / self.n
    self.m2 = self.m2 + d * (x - self.mu)
    self.sd = self.n < 2 and 0 or (self.m2/(self.n - 1))^.5 
    if x > self.hi then self.hi = x end
    if x < self.lo then self.lo = x end end end  

function SYM:add(x: atom,  n) -->  nil
  if x ~= "?" then
    n           = n or 1
    self.n      = n + self.n 
    self.has[x] = n + (self.has[x] or 0) 
    if self.has[x] > self.most then
      self.most, self.mode = self.has[x], x end end end

Query

function NUM:norm(x:num) --> 0..1
  return x=="?" and x or (x - self.lo) / (self.hi - self.lo + 1/l.big) end

function NUM:pdf(x:num) --> num
  return math.exp(-.5*((x - self.mu)/self.sd)^2) / (self.sd*((2*math.pi)^0.5)) end


function NUM:cdf(x:num,     fun,z) --> num
  fun = function(z) return 1 - 0.5 * math.exp(-0.717 * z - 0.416 * z * z) end
  z   = (x - self.mu) / self.sd
  return  z>=0 and fun(z) or 1 - fun(-z) end

function NUM:discretize(x:num) --> num
  return self:cdf(x) * the.ranges // 1 end

function SYM:discretize(x:atom) --> atom
  return x end

function SYM:entropy(     fun) --> float
  fun = function(n) return n/self.n * math.log(n/self.n,2) end
  return -l.sum(self.has, fun) end

Distance

function NUM:dist(a:num, b:num) --> num
  if a=="?" and b=="?" then return 1 end
  a,b = self:norm(a), self:norm(b)
  a = a ~= "?" and a or (b < .5 and 1 or 0)
  b = b ~= "?" and b or (a < .5 and 1 or 0)
  return math.abs(a - b) end

function SYM:dist(a:atom, b:atom) --> 0,1
  return x==y and 0 or 1 end

function DATA:xDist(a:row, b:row,       fun) --> num
  fun = function(col) return col:dist(a[col.i], b[col.i])^the.p end
  return sum(self.cols.x, fun) / (#self.cols.x)^(1/the.p) end

function DATA:twoFar(repeats:int,rows,?sortp:bool,    most,a0,b0,a,b,d) --> row,row
  most = 0
  for i=1,repeats do 
    a0,b0 = l.any(rows), l.any(rows)
    d = self:xDist(a0,b0)
    if d > most then most,a,b = d,a0,b0 end end
  if sortp and self:yDist(b) < self:yDist(a) then a,b = a,b end
  return most,a,b end

function DATA:half(rows, ?sortp:bool) --> float,rows,rows,row,row
  local lefts,rights,left,right,cos,fun = {},{}
  c, left,right = self:twoFar(the.far, rows, sortp)
  cos = function(a,b) return (a^2 + c^2 - b^2) / (2*c+ 1E-32) end 
  fun = function(r) return {d   = cos(self:xDist(r,left), self:xDist(r,right)),
                            row = r} end
  for i,one in pairs(sort(map(rows, fun), lt"d")) do
    l.push(i <= #rows//2 and lefts or rights, one.row) end
  return self.xDist(left,rights[1]), lefts, rights, left, right end

-- tree, where,
function DATA:branch(rows:rows, ?stop:int,    done,label,grow) --> rows,rows
  stop = stop or #rows^the.leaf
  done = {}
  label= function(row) done[l.id(row)] = row end
  grow = function(rows,       _,lefts,__,left,right)
           if #rows <= stop then return rows end 
           _, lefts, __, left, right = self:half(rows,true)
           label(left)
           label(right)
           return grow(lefts) end
  return grow(rows, stop or #rows^the.leaf), done end

function DATA:tree(rows:rows, ?stop:int, ?fun:function,   done,label,grow)
  stop = stop or #rows^the.leaf
  done = {}
  label= function(row) done[l.id(row)] = row end
  grow = function(rows,  fun,     here,cut,lefts,rights,left,right)
           here = CLUSTER(self:clone(rows), fun)
           if #rows <= 2*stop then return here end
           cut, lefts, rights, left, right = self:half(rows,true)
           label(left)
           label(right)
           here.lefts  = grow(lefts, function(r) return self:xDist(r,left) < cut end)
           here.rights = grow(rights,function(r) return self:xDist(r,left) > cut end)
           return here end
  return grow(rows), done end

Goals

function DATA:yDist(row:row,     fun) --> 0..1
  fun = function(col) return math.abs(col.goal - col:norm(row[col.i])) end
  return l.max(self.cols.y, fun) end

function DATA:shuffle() --> DATA
  self.rows = l.shuffle(self.rows)
    return self end

function DATA:sort(    fun) --> DATA
  fun = function(row) return self:yDist(row) end
  self.rows = l.sort(self.rows, function(a,b) return fun(a) < fun(b) end)
  return self end

function DATA:bestRest(top,      best,rest) --> DATA,DATA
  self:sort()
  best,rest = self:clone(), self:clone()
  for i,row in pairs(self:sort().rows) do
    (i <= (#self.rows)^(top or the.Top) and best or rest):add(row) end
  return best,rest end

Bayes

function SYM:like(x:atom, prior:num) --> num
  return ((self.has[x] or 0) + the.m*prior)/(self.n +the.m) end

function NUM:like(x:num,...) --> num
  return self.sd==0 and  (x==self.mu and 1 or 1E-32) or math.min(1,self:pdf(x)) end

function DATA:like(row:row, n:int, nClasses:int) --> num
  local col,prior,out,v,inc
  prior = (#self.rows + the.k) / (n + the.k * nClasses)
  out   = math.log(prior)
  for _,col in pairs(self.cols.x) do
    v = row[col.i]
    if v ~= "?" then
      inc = col:like(v,prior)
      if inc > 0 then out = out + math.log(inc) end end end
  return out end

function DATA:acquire(score:function, rows:rows) --> row
  local todo,done,top
  todo, done = {},{}
  for _,rows in pairs{rows or {}, self.rows} do
    for _,row in pairs(rows) do
      l.push(#done < the.begin and done or todo, row) end end
  while #done < the.Break do
    top, todo = self:guess(todo, done, score or function(B,R) return B-R end)
    l.push(done, top) 
    done = self:clone(done):sort().rows end
  return done end

function DATA:guess(todo, done, score) --> row
  local out,cut,best,rest,fun,top,j,k = {}, math.min(the.cut,#todo)/#todo
  best,rest = self:clone(done):bestRest()
  fun = function(i,t,    s) 
          s = math.random()>cut and -big or score(best:like(t,#done,2), rest:like(t,#done,2))
          return {s,t} end
  for i,t in pairs(l.sort(l.maps(todo,fun), l.gt(1))) do 
    if i==1 then top=t[2] else l.push(out, t[2]) end end
  return top, out end

Contrasts

local CONTRAST={}

function CONTRAST:new(bins,goal,i,is,lo,hi,B,R)
  return l.new(CONTRAST, {bins=bins, goal=goal,  i=i, is=is, lo=lo, hi=hi, 
                          n=0, bests=0, rests=0, B=B, R=R}) end

function CONTRAST:__tostring() return "12" end
function CONTRAST:add(x,y)
  if x < self.lo then self.lo=x end
  if x > self.hi then self.hi=x end 
  self.n = self.n + 1
  if y==self.goal then self.bests = self.bests+1 else self.rests = self.rests+1 end end

function CONTRAST:entropy(     p1,p2)
  p1, p2 = self.bests/self.n, self.rests/self.n
  return -p1*math.log(p1,2) - p2*math.log(p2,2) end 

function CONTRAST:score(    b,r)
  b,r = self.bests/self.B,self.rests/self.R
  return b^2/(b + r + 1E-32) end

function CONTRAST:combined(other,dull,small,      k,e0,e1,e2)
  k = self:combine(other)
  if math.abs(self.lo  - other.lo) < dull then return k end
  if self.n < small or other.n < small    then return k end
  e0, e1, e2 = k:entropy(), self:entropy(), other:entropy()
  if e0 <= (e1*self.n + e2*other.n) / k.n then return k end end

function CONTRAST.combine(i,j,      k,lo,hi)
  lo = math.min(i.lo,j.lo)
  hi = math.max(i.hi,j.hi)
  k = CONTRAST:new(i.bins, i.goal, i.i, i.is, lo, hi,i.B,i.R)
  for _,bin in pairs(j.bins) do k.bins[bin]=bin end
  k.n     = i.n + j.n
  k.bests = i.bests + j.bests
  k.rests = i.rests + j.rests
  return k end 

function DATA:contrasts(other,both,      out)
  out = {}
  for i,col in pairs(both.cols.x) do
    for _,contrast in pairs(self:contrasts4col(col,other)) do 
        l.push(out, contrast) end end 
  return   l.sort(out, l.up(function(c) return c:score() end)) end

function DATA:contrasts4col(col,other,      x,b,out,index)
  out, index = {}, {}
  for klass,rows in pairs{best=self.rows, rest=other.rows} do
    for _,row in pairs(rows) do
      x = row[col.i]
      if x ~= "?" then
        b = col:discretize(x)
        index[b] = index[b] or 
        l.push(out,CONTRAST:new({b=b},"best",col.i,col.is,x,x,#self.rows,#other.rows))
        index[b]:add(x,klass) end end end
  return col:contrastsCombined(l.sort(out,l.lt"lo"), col.n / the.ranges) end

function SYM:contrastsCombined(contrasts,_) return contrasts end

function NUM:contrastsCombined(contrasts,small,    t,new,dull)
  dull = self.sd * the.Cohen
  t={contrasts[1]} 
  for i,contrast in pairs(contrasts) do
    if i > 1 then
      new = contrast:combined(t[#t], dull,small) 
      if new then t[#t] = new else l.push(t,contrast) end end end
  return t end

Lib

l.big = 1E32          --> num
l.pop = table.remove  --> any
l.fmt = string.format --> str

function l.id(t) return t[#t] end

function l.map(t:list,fun:function,     u) --> list
  u={}; for _,v in pairs(t) do u[1+#u] = fun(v)  end; return u end

function l.maps(t:list,fun:function,    u) --> list
  u={}; for k,v in pairs(t) do u[1+#u]=fun(k,v)  end; return u end

l.filter  = l.map
l.filters = l.maps

function l.sum(t:list,?fun:function,      out) --> num
  out,fun = 0,fun or function(x) return x end
  l.map(t,function(x) out = out + fun(x) end)
  return out end

function l.max(t:list[X], ?fun:function,      most) --> X
  most,fun = -l.big, fun or function(x) return x end
  l.map(t, function(x) local tmp=fun(x); if tmp > most then most=tmp end end)
  return most end
 
function l.new(klass:table, obj:table) --> obj 
  klass.__index    = klass
  klass.__tostring = klass.__tostring or l.o
  return setmetatable(obj,klass) end

function l.push(t:list, x:any) --> list
  t[1+#t]=x; return x end

function l.sort(t:list, ?fun:function) --> list
  table.sort(t,fun); return t end

function l.median(t:list) --> list
  return l.sort(t)[.5*#t//1] end

function l.lt(key:str) -->  function
  return function(a,b) return a[key] < b[key] end end

function l.gt(key:str) --> function
  return function(a,b) return a[key] > b[key] end end

function l.up(fun:function) --> function
  return function(a,b) return fun(a) > fun(b) end end

function l.down(fun:function) --> function
  return function(a,b) return fun(a) < fun(b) end end

function l.keys(t:list,    u) --> list
  u={}; for k,_ in pairs(t) do l.push(u,k) end return l.sort(u) end   

function l.any(t:list) --> any
  return t[math.random(#t)] end

function l.shuffle(t:list,    j) --> list
  for i = #t, 2, -1 do j = math.random(i); t[i], t[j] = t[j], t[i] end
  return t end

function l.coerce(s:str,     fun) --> atom
  fun = function(s) return s=="true" and true or s ~= "false" and s end
  return math.tointeger(s) or tonumber(s) or fun(l.trim(s)) end

function l.csv(file:str, fun:function,      src,s,cells,n) --> nil
  function cells(s,    t)
    t={}; for s1 in s:gmatch"([^,]+)" do l.push(t,l.coerce(s1)) end; return t end
  src = io.input(file)
  n   = -1
  while true do
    s = io.read()
    if s then n=n+1; fun(n,cells(s)) else return io.close(src) end end end

function l.trim( s:str ) --> str
  return s:match"^%s*(.-)%s*$" end

function l.o(x:any,     f,g) --> str
  if type(x) == "number" then return l.fmt("%g",x) end
  if type(x) ~= "table"  then return tostring(x)   end
  f=function(x)   return l.o(x) end
  g=function(k,v) return l.o(k):find"^_" and nil or l.fmt(":%s %s",k,l.o(x[k])) end 
  return "{" .. table.concat(#x>0 and l.map(x,f) or l.sort(l.maps(x,g))," ").."}" end

function l.oo(x:any) --> nil
  print(l.o(x)) end

function l.yellow(s) return "\27[33m" .. s .. "\27[0m" end
function l.green(s)  return "\27[32m" .. s .. "\27[0m" end
function l.red(s)    return "\27[31m" .. s .. "\27[0m" end

Main

local go = {}

function go.h(_) print("\n" ..help) end

function go.the(_) l.oo(the) end

function go.all(_,     status,msg,fails,todos) 
  todos, fails = "sort csv data bayes cheb acq", 0
  for x in todos:gmatch"([^ ]+)" do
    print(l.yellow(x))
    math.randomseed(the.seed)
    status,msg = xpcall(go[x], debug.traceback, _)
    if status == false then 
      print(l.red("fail in "..x.." :"..msg)); fails = fails + 1
    else print(l.green("pass")) end end
  os.exit(fails) end 

function go.cut(x) the.cut = l.coerce(x) end  
function go.train(x) the.train = x end  

function go.seed(x) the.seed=l.coerce(x); math.randomseed(the.seed) end

function go.sum(_,  t)
  fun = function(x) return -x end
  assert(-10 == l.sum({1,2,3,4}, fun), "gosummax fail") end

function go.max(_,  fun)
  fun = function(x) return -x end
  assert(-10 == l.max({1,3,10,-10,5,5},fun), "go.max fail") end

function go.sort(_,     t)
  t = l.sort({10,1,2,3,1,4,1,1,2,4,2,1}, function(a,b) return a>b end)
  assert(t[1]==10, "wrong sort") end

function go.csv(_,     fun) 
  fun = function(n,t) if (n % 60) == 0 then print(n, l.o(t)) end end
  l.csv(the.train, fun) end

function go.data(_,      d)
  d = DATA:new():csv(the.train):shuffle():sort()
  for n,row in pairs(d.rows) do 
    if n==1 or (n%30)==0 then print(n,l.o(row)) end end
  print""; for _,col in pairs(d.cols.y) do l.oo(col) end end

function go.bayes(_,      d,fun)
  d   = DATA:new():csv(the.train) 
  fun = function(t) return d:like(t,1000,2) end
  for n,t in pairs(l.sort(d.rows, l.down(fun))) do
    if n==1 or n==#d.rows or (n%30)==0 then print(n, t[#t], fun(t)) end end end

function go.cheb(_,      d,num)
  d   = DATA:new():csv(the.train):sort() 
  num = NUM:new()
  for _,t in pairs(d.rows) do num:add(d:yDist(t)) end
  mu,sd = num.mu, num.sd
  assert(0.69 < mu and mu < 0.7 and 0.16 < sd and sd < 0.17,"bad cheb") end

function go.acq(_,      d,toBe,t,asIs,repeats,start)
  d = DATA:new():csv(the.train) 
  asIs,toBe = {},{}
  for _,t in pairs(d.rows) do l.push(asIs, d:yDist(t)) end
  repeats = 20
  start = os.clock()
  for i=1,repeats do l.push(toBe, d:yDist(d:shuffle():acquire()[1])) end
  l.oo{secs = (os.clock() - start)/repeats, asIs=l.median(asIs), toBe=l.median(toBe)} 
end

function go.br(_,     both,best,rest)
  both = DATA:new():csv(the.train)
  best,rest = both:bestRest(.5)
  for i,row in pairs(best.rows) do 
    print(i,l.fmt("%.2f",both:yDist(row)),l.o(row))  end
  print""
  for i,row in pairs(rest.rows) do 
    if i==1 or i%30==0 then 
      print(i+#best.rows,l.fmt("%.2f",both:yDist(row)),l.o(row)) end end end 

function go.contrasts(_,     both,best,rest)
  both = DATA:new():csv(the.train)
  best,rest = both:bestRest(.5)
  for i,c in pairs(best:contrasts(rest,both)) do 
    print(i, c, c:score(#best.rows, #rest.rows)) end end

function go.push(_) os.execute("git commit -am saving; git push; git status") end
function go.pdf(_)  os.execute("make -B ~/tmp/min.pdf; open ~/tmp/min.pdf") end
function go.doc(_)  os.execute(
  "pycco -d ~/tmp min.lua; echo 'p {text-align:right;}' >> ~/tmp/pycco.css") end

Start

help:gsub("\n%s+-%S%s(%S+)[^=]+=%s+(%S+)", function(k,v) the[k]= l.coerce(v) end)
math.randomseed(the.seed)

if arg[0]:find"noml.lua" then
  for i,s in pairs(arg) do 
    s = s:sub(2)
    if go[s] then go[s]( arg[i+1] ) end end end

return {NUM=NUM,SYM=SYM,DATA=DATA,lib=l,the=the,help=help} 

  1. https://github.com/timm/noml/blob/main/etc/md2lua.awk↩︎