## Copyright (C) 2021 Stefano Guidoni <ilguido@users.sf.net>
## Copyright (C) 2024 Andreas Bertsatos <abertsatos@biol.uoa.gr>
##
## This file is part of the statistics package for GNU Octave.
##
## This program is free software; you can redistribute it and/or modify it under
## the terms of the GNU General Public License as published by the Free Software
## Foundation; either version 3 of the License, or (at your option) any later
## version.
##
## This program is distributed in the hope that it will be useful, but WITHOUT
## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
## details.
##
## You should have received a copy of the GNU General Public License along with
## this program; if not, see <http://www.gnu.org/licenses/>.

classdef ClusterCriterion < handle
  ## -*- texinfo -*-
  ## @deftypefn  {statistics} {@var{obj} =} ClusterCriterion (@var{x}, @var{clust}, @var{KList})
  ##
  ## A clustering evaluation object as created by @code{evalclusters}.
  ##
  ## @code{ClusterCriterion} is a superclass for clustering evaluation objects
  ## as created by @code{evalclusters}.
  ##
  ## @var{x} is the numeric data matrix, one observation per row.  @var{clust}
  ## is either the name of a clustering algorithm (@qcode{"kmeans"},
  ## @qcode{"linkage"} or @qcode{"gmdistribution"}), a function handle, or a
  ## numeric matrix of clustering solutions with as many rows as @var{x}.
  ## @var{KList} is a vector of positive integers listing the numbers of
  ## clusters to inspect.
  ##
  ## List of public properties:
  ## @table @code
  ## @item @qcode{ClusteringFunction}
  ## a valid clustering function name or function handle.  It can be empty if
  ## the clustering solutions are passed as an input matrix.
  ##
  ## @item @qcode{CriterionName}
  ## a valid criterion name to evaluate the clustering solutions.
  ##
  ## @item @qcode{CriterionValues}
  ## a vector of values as generated by the evaluation criterion for each
  ## clustering solution.
  ##
  ## @item @qcode{InspectedK}
  ## the list of proposed cluster numbers.
  ##
  ## @item @qcode{Missing}
  ## a logical vector of missing observations.  When there are @code{NaN}
  ## values in the data matrix, the corresponding observation is excluded.
  ##
  ## @item @qcode{NumObservations}
  ## the number of non-missing observations in the data matrix.
  ##
  ## @item @qcode{OptimalK}
  ## the optimal number of clusters.
  ##
  ## @item @qcode{OptimalY}
  ## the clustering solution corresponding to @code{OptimalK}.
  ##
  ## @item @qcode{X}
  ## the data matrix.
  ##
  ## @end table
  ##
  ## List of public methods:
  ## @table @code
  ## @item @qcode{addK}
  ## add a list of numbers of clusters to evaluate.
  ##
  ## @item @qcode{compact}
  ## return a compact clustering evaluation object.  Not implemented.
  ##
  ## @item @qcode{plot}
  ## plot the clustering evaluation values against the corresponding number of
  ## clusters.
  ##
  ## @end table
  ##
  ## @seealso{evalclusters, CalinskiHarabaszEvaluation, DaviesBouldinEvaluation,
  ## GapEvaluation, SilhouetteEvaluation}
  ## @end deftypefn

  properties (Access = public)
    ## public properties
  endproperties

  properties (GetAccess = public, SetAccess = protected)
    ClusteringFunction = "";
    CriterionName = "";
    CriterionValues = [];
    InspectedK = [];
    Missing = [];
    NumObservations = 0;
    OptimalK = 0;
    OptimalY = [];
    X = [];
  endproperties

  properties (Access = protected)
    N = 0; # number of observations (rows of X)
    P = 0; # number of variables (columns of X)
    ClusteringSolutions = []; # one column of cluster labels per inspected K
    OptimalIndex = 0; # index of the optimal K (0 when not yet determined)
  endproperties

  methods (Access = public)

    ## constructor
    ##
    ## Validates and stores the data matrix X, flags the observations (rows)
    ## that contain NaN values, interprets CLUST as an algorithm name, a
    ## function handle or a matrix of precomputed clustering solutions, and
    ## validates the list of cluster numbers KLIST.
    function this = ClusterCriterion (x, clust, KList)
      ## parsing input data
      if ((! ismatrix (x)) || (! isnumeric (x)))
        error ("ClusterCriterion: 'x' must be a numeric matrix");
      endif
      this.X = x;
      this.N = rows (this.X);
      this.P = columns (this.X);

      ## look for missing values: NaN never compares equal to anything, not
      ## even to itself, so isnan() must be used instead of '== NaN'.
      ## The result is kept as a logical row vector with one flag per row.
      this.Missing = any (isnan (this.X), 2)';

      ## number of usable observations
      this.NumObservations = sum (! this.Missing);

      ## parsing the clustering algorithm
      if (ischar (clust))
        if (any (strcmpi (clust, {"kmeans", "linkage", "gmdistribution"})))
          this.ClusteringFunction = lower (clust);
        else
          error ("ClusterCriterion: unknown clustering algorithm '%s'", clust);
        endif
      elseif (isa (clust, "function_handle"))
        this.ClusteringFunction = clust;
      elseif (ismatrix (clust))
        if (isnumeric (clust) && (ndims (clust) == 2) && ...
            (rows (clust) == this.N))
          ## precomputed solutions: keep only the non-missing observations
          this.ClusteringFunction = "";
          this.ClusteringSolutions = clust(! this.Missing, :);
        else
          error ("ClusterCriterion: invalid matrix of clustering solutions");
        endif
      else
        error ("ClusterCriterion: invalid argument");
      endif

      ## parsing the list of cluster sizes to inspect
      this.InspectedK = parseKList (this, KList);
    endfunction

    ## -*- texinfo -*-
    ## @deftypefn {ClusterCriterion} {@var{obj} =} addK (@var{obj}, @var{K})
    ##
    ## Add a new cluster array to inspect the ClusterCriterion object.
    ##
    ## The values in @var{K} are merged with the current @code{InspectedK}
    ## list.  Already stored clustering solutions and criterion values are
    ## kept for the cluster numbers that were present before, whereas the
    ## entries for the new cluster numbers are initialized to zero.  The
    ## optimal solution is reset.  A warning is issued, and the object is left
    ## unchanged, when there is no clustering function or when @var{K} does
    ## not add any new value.
    ##
    ## @end deftypefn
    function this = addK (this, k)

      ## if there is not a clustering function, then we are using a predefined
      ## set of clustering solutions, hence we cannot redefine the number of
      ## solutions
      if (isempty (this.ClusteringFunction))
        warning (["ClusterCriterion.addK: cannot redefine the list of cluster" ...
                  " numbers to evaluate when there is not a clustering function"]);
        return;
      endif

      ## otherwise go on
      newList = this.parseKList ([this.InspectedK k]);

      ## check if the list has changed
      if (length (newList) == length (this.InspectedK))
        warning ("ClusterCriterion.addK: the list has not changed");
      else
        ## update ClusteringSolutions and CriterionValues: copy every old
        ## result into the slot that its K occupies in the merged list
        ClusteringSolutions_tmp = zeros (this.NumObservations, ...
                                    length (newList));
        CriterionValues_tmp = zeros (length (newList), 1);
        for iter = 1 : length (this.InspectedK)
          idx = find (newList == this.InspectedK(iter));

          if (! isempty (idx))
            ClusteringSolutions_tmp(:, idx) = this.ClusteringSolutions(:, iter);
            CriterionValues_tmp(idx) = this.CriterionValues(iter);
          endif
        endfor
        this.ClusteringSolutions = ClusteringSolutions_tmp;
        this.CriterionValues = CriterionValues_tmp;

        ## reset the old results
        this.OptimalK = 0;
        this.OptimalY = [];
        this.OptimalIndex = 0;

        ## update the list of cluster numbers to evaluate
        this.InspectedK = newList;
      endif
    endfunction

    ## -*- texinfo -*-
    ## @deftypefn  {ClusterCriterion} {} plot (@var{obj})
    ## @deftypefnx {ClusterCriterion} {@var{h} =} plot (@var{obj})
    ##
    ## Plot the evaluation results.
    ##
    ## Plot the CriterionValues against InspectedK from the ClusterCriterion,
    ## @var{obj}, to the current plot.  The optimal solution is marked with a
    ## star when it is available.  It can also return a handle to the
    ## current axes.
    ##
    ## @end deftypefn
    function h = plot (this)
      yLabel = sprintf ("%s value", this.CriterionName);
      h = gca ();
      hold on;
      plot (this.InspectedK, this.CriterionValues, "bo-");
      ## OptimalIndex is 0 until an optimum has been computed; indexing with 0
      ## would raise an error, so the marker is drawn only when it is valid
      if (this.OptimalIndex > 0)
        plot (this.OptimalK, this.CriterionValues(this.OptimalIndex), "b*");
      endif
      xlabel ("number of clusters");
      ylabel (yLabel);
      hold off;
    endfunction

    ## -*- texinfo -*-
    ## @deftypefn {ClusterCriterion} {@var{obj} =} compact (@var{obj})
    ##
    ## Return a compact ClusterCriterion object (not implemented yet).
    ##
    ## @end deftypefn
    function this = compact (this)
      warning ("ClusterCriterion.compact: this method is not yet implemented.");
    endfunction

  endmethods

  methods (Access = private)
    ## check if a list of cluster sizes is correct
    ##
    ## KLIST must be a numeric vector of finite, strictly positive integer
    ## values; otherwise an error is raised.  The returned list holds the
    ## unique values of KLIST in ascending order.
    function retList = parseKList (this, KList)
      ## 'all (find (KList > 0))' is always true, because find() returns
      ## indices, which are never zero; the comparison must be tested directly
      if (isnumeric (KList) && isvector (KList) && ...
          all (isfinite (KList)) && all (KList > 0) && ...
          all (floor (KList) == KList))
        retList = unique (KList);
      else
        error (["ClusterCriterion: the list of cluster sizes must be an " ...
                "array of positive integer numbers"]);
      endif
    endfunction
  endmethods
endclassdef

## Test input validation of the ClusterCriterion constructor: each block below
## passes one invalid argument and checks the resulting error message.
%!error <ClusterCriterion: 'x' must be a numeric matrix> ...
%! ClusterCriterion ("1", "kmeans", [1:6])
%!error <ClusterCriterion: unknown clustering algorithm 'k'> ...
%! ClusterCriterion ([1, 2, 1, 3, 2, 4, 3], "k", [1:6])
%!error <ClusterCriterion: invalid matrix of clustering solutions> ...
%! ClusterCriterion ([1, 2, 1; 3, 2, 4], 1, [1:6])
%!error <ClusterCriterion: invalid argument> ...
%! ClusterCriterion ([1, 2, 1; 3, 2, 4], ones (2, 2, 2), [1:6])
