unit DecisionTree;
{$MODE OBJFPC}{$H+}

{
    Part of AdvancedChatAI.
    For GNU/Linux 64 bit version.
    Version: 1.
    Written on FreePascal (https://freepascal.org/).
    Copyright (C) 2025-2026 Artyomov Alexander
    Used https://chat.deepseek.com/
    http://self-made-free.ru/
    aralni@mail.ru

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License as
    published by the Free Software Foundation, either version 3 of the
    License, or (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License
    along with this program.  If not, see <https://www.gnu.org/licenses/>.
}


interface

uses
  Classes, SysUtils, Math, contnrs;

type
  TDoubleArray = array of Double;
  TDoubleMatrix = array of TDoubleArray;
  TStringArray = array of String;
  TIntArray = array of Integer;

  PTreeNode = ^TTreeNode;
  TTreeNode = record
    IsLeaf: Boolean;
    Value: String;
    SplitFeature: Integer;
    SplitValue: Double;
    Left, Right: PTreeNode;
  end;

  TDecisionTree = class
  private
    FRoot: PTreeNode;
    FMaxDepth: Integer;
    FMinSamplesSplit: Integer;
    
    function BuildTree(const X: TDoubleMatrix; const Y: TStringArray;
                     const Indices: TIntArray; Depth: Integer; 
                     UsedFeatures: TIntArray): PTreeNode;
    function CalculateGini(const Y: TStringArray; const Indices: TIntArray): Double;
    function FindBestSplit(const X: TDoubleMatrix; const Y: TStringArray;
                         const Indices: TIntArray; out BestFeature: Integer;
                         out BestValue: Double; out LeftIndices, RightIndices: TIntArray;
                         const UsedFeatures: TIntArray): Boolean;
    function GetMostCommonValue(const Y: TStringArray; const Indices: TIntArray): String;
    procedure FreeTree(var Node: PTreeNode);
    function CountUniqueValues(const Y: TStringArray; const Indices: TIntArray): Integer;
  public
    constructor Create(AMaxDepth: Integer = 5; AMinSamplesSplit: Integer = 3);
    destructor Destroy; override;
    procedure Train(const X: TDoubleMatrix; const Y: TStringArray);
    function Predict(const x: TDoubleArray): String;
  end;

implementation

type
  TStringCounter = class
    Str: String;
    Count: Integer;
    constructor Create(AStr: String);
  end;

constructor TStringCounter.Create(AStr: String);
begin
  Str := AStr;
  Count := 1;
end;

constructor TDecisionTree.Create(AMaxDepth: Integer; AMinSamplesSplit: Integer);
begin
  FRoot := nil;
  FMaxDepth := AMaxDepth;
  FMinSamplesSplit := AMinSamplesSplit;
end;

destructor TDecisionTree.Destroy;
begin
  FreeTree(FRoot);
  inherited;
end;

function TDecisionTree.CountUniqueValues(const Y: TStringArray; const Indices: TIntArray): Integer;
var
  i, j: Integer;
  UniqueValues: array of String;
  IsUnique: Boolean;
begin
  Result := 0;
  SetLength(UniqueValues, 0);
  
  for i := 0 to High(Indices) do
  begin
    IsUnique := True;
    for j := 0 to High(UniqueValues) do
    begin
      if UniqueValues[j] = Y[Indices[i]] then
      begin
        IsUnique := False;
        Break;
      end;
    end;
    
    if IsUnique then
    begin
      SetLength(UniqueValues, Length(UniqueValues) + 1);
      UniqueValues[High(UniqueValues)] := Y[Indices[i]];
    end;
  end;
  
  Result := Length(UniqueValues);
end;

function TDecisionTree.GetMostCommonValue(const Y: TStringArray; const Indices: TIntArray): String;
var
  i, j, MaxCount: Integer;
  Counters: TObjectList;
  Found: Boolean;
begin
  Counters := TObjectList.Create(True);
  try
    // Подсчёт уникальных строк
    for i := 0 to High(Indices) do
    begin
      Found := False;
      for j := 0 to Counters.Count-1 do
      begin
        if TStringCounter(Counters[j]).Str = Y[Indices[i]] then
        begin
          Inc(TStringCounter(Counters[j]).Count);
          Found := True;
          Break;
        end;
      end;
      
      if not Found then
        Counters.Add(TStringCounter.Create(Y[Indices[i]]));
    end;

    // Поиск максимального значения
    Result := '';
    MaxCount := 0;
    for i := 0 to Counters.Count-1 do
    begin
      if TStringCounter(Counters[i]).Count > MaxCount then
      begin
        MaxCount := TStringCounter(Counters[i]).Count;
        Result := TStringCounter(Counters[i]).Str;
      end;
    end;
  finally
    Counters.Free;
  end;
end;

function TDecisionTree.CalculateGini(const Y: TStringArray; const Indices: TIntArray): Double;
var
  i, j: Integer;
  Counters: TObjectList;
  Found: Boolean;
  p: Double;
begin
  Counters := TObjectList.Create(True);
  try
    // Подсчёт уникальных строк
    for i := 0 to High(Indices) do
    begin
      Found := False;
      for j := 0 to Counters.Count-1 do
      begin
        if TStringCounter(Counters[j]).Str = Y[Indices[i]] then
        begin
          Inc(TStringCounter(Counters[j]).Count);
          Found := True;
          Break;
        end;
      end;
      
      if not Found then
        Counters.Add(TStringCounter.Create(Y[Indices[i]]));
    end;

    // Расчёт индекса Джини
    Result := 1.0;
    for i := 0 to Counters.Count-1 do
    begin
      p := TStringCounter(Counters[i]).Count / Length(Indices);
      Result := Result - p*p;
    end;
  finally
    Counters.Free;
  end;
end;

function TDecisionTree.FindBestSplit(const X: TDoubleMatrix; 
                                   const Y: TStringArray;
                                   const Indices: TIntArray; 
                                   out BestFeature: Integer;
                                   out BestValue: Double;
                                   out LeftIndices, RightIndices: TIntArray;
                                   const UsedFeatures: TIntArray): Boolean;
var
  i, j, k, feat: Integer;
  currentValue, currentGain, bestGain: Double;
  currentLeft, currentRight: TIntArray;
  FeatureUsed: Boolean;
begin
  Result := False;
  BestFeature := -1;
  BestValue := 0;
  bestGain := -1;
  SetLength(LeftIndices, 0);
  SetLength(RightIndices, 0);

  // Если все значения одинаковые, не разделяем
  if CountUniqueValues(Y, Indices) <= 1 then
    Exit(False);

  // Проверяем все признаки
  for feat := 0 to Length(X[0])-1 do
  begin
    // Пропускаем уже использованные признаки
    FeatureUsed := False;
    for i := 0 to High(UsedFeatures) do
    begin
      if UsedFeatures[i] = feat then
      begin
        FeatureUsed := True;
        Break;
      end;
    end;
    if FeatureUsed then Continue;

    // Проверяем 10 случайных значений для разделения
    for k := 1 to 10 do
    begin
      i := Indices[Random(Length(Indices))];
      currentValue := X[i][feat];
      
      // Разделяем данные
      SetLength(currentLeft, 0);
      SetLength(currentRight, 0);
      for i := 0 to High(Indices) do
      begin
        if X[Indices[i]][feat] <= currentValue then
        begin
          SetLength(currentLeft, Length(currentLeft)+1);
          currentLeft[High(currentLeft)] := Indices[i];
        end
        else
        begin
          SetLength(currentRight, Length(currentRight)+1);
          currentRight[High(currentRight)] := Indices[i];
        end;
      end;
      
      // Вычисляем gain только если разделение имеет смысл
      if (Length(currentLeft) > 0) and (Length(currentRight) > 0) then
      begin
        currentGain := CalculateGini(Y, Indices) - 
                      (CalculateGini(Y, currentLeft)*Length(currentLeft) + 
                       CalculateGini(Y, currentRight)*Length(currentRight)) / Length(Indices);
        
        if currentGain > bestGain then
        begin
          bestGain := currentGain;
          BestFeature := feat;
          BestValue := currentValue;
          LeftIndices := currentLeft;
          RightIndices := currentRight;
          Result := True;
        end;
      end;
    end;
  end;
end;

function TDecisionTree.BuildTree(const X: TDoubleMatrix; 
                               const Y: TStringArray;
                               const Indices: TIntArray; 
                               Depth: Integer;
                               UsedFeatures: TIntArray): PTreeNode;
var
  BestFeature,I: Integer;
  BestValue: Double;
  LeftIndices, RightIndices: TIntArray;
  NewUsedFeatures: TIntArray;
begin
  New(Result);
  
  // Условия остановки
  if (Depth >= FMaxDepth) or (Length(Indices) <= FMinSamplesSplit) or 
     (CountUniqueValues(Y, Indices) <= 1) or
     not FindBestSplit(X, Y, Indices, BestFeature, BestValue, LeftIndices, RightIndices, UsedFeatures) then
  begin
    Result^.IsLeaf := True;
    Result^.Value := GetMostCommonValue(Y, Indices);
    Exit;
  end;
  
  // Создаем внутренний узел
  Result^.IsLeaf := False;
  Result^.SplitFeature := BestFeature;
  Result^.SplitValue := BestValue;
  
  // Добавляем использованный признак
  SetLength(NewUsedFeatures, Length(UsedFeatures)+1);
  for i := 0 to High(UsedFeatures) do
    NewUsedFeatures[i] := UsedFeatures[i];
  NewUsedFeatures[High(NewUsedFeatures)] := BestFeature;
  
  Result^.Left := BuildTree(X, Y, LeftIndices, Depth+1, NewUsedFeatures);
  Result^.Right := BuildTree(X, Y, RightIndices, Depth+1, NewUsedFeatures);
end;

procedure TDecisionTree.Train(const X: TDoubleMatrix; const Y: TStringArray);
var
  Indices: TIntArray;
  i: Integer;
  UsedFeatures: TIntArray;
begin
  FreeTree(FRoot);
  
  if (Length(X) = 0) or (Length(X) <> Length(Y)) then
    raise Exception.Create('Неверные данные для обучения');
  
  SetLength(Indices, Length(X));
  for i := 0 to High(Indices) do
    Indices[i] := i;
    
  SetLength(UsedFeatures, 0);
  FRoot := BuildTree(X, Y, Indices, 0, UsedFeatures);
end;

function TDecisionTree.Predict(const x: TDoubleArray): String;
var
  CurrentNode: PTreeNode;
begin
  if FRoot = nil then
    raise Exception.Create('Дерево не обучено');
    
  if Length(x) = 0 then
    Exit('Неверные входные данные');
    
  CurrentNode := FRoot;
  while not CurrentNode^.IsLeaf do
  begin
    if CurrentNode^.SplitFeature >= Length(x) then
      raise Exception.Create('Неверный признак в дереве');
      
    if x[CurrentNode^.SplitFeature] <= CurrentNode^.SplitValue then
      CurrentNode := CurrentNode^.Left
    else
      CurrentNode := CurrentNode^.Right;
  end;
  
  Result := CurrentNode^.Value;
end;

procedure TDecisionTree.FreeTree(var Node: PTreeNode);
begin
  if Node = nil then Exit;
  
  if not Node^.IsLeaf then
  begin
    FreeTree(Node^.Left);
    FreeTree(Node^.Right);
  end;
  
  Dispose(Node);
  Node := nil;
end;

end.