unit DataAugmentation;
{$MODE OBJFPC}{$H+}{$CODEPAGE UTF8}

{
    Part of AdvancedChatAI.
    For GNU/Linux 64 bit version.
    Version: 1.
    Written on FreePascal (https://freepascal.org/).
    Copyright (C) 2025-2026 Artyomov Alexander
    Used https://chat.deepseek.com/
    http://self-made-free.ru/
    aralni@mail.ru

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License as
    published by the Free Software Foundation, either version 3 of the
    License, or (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License
    along with this program.  If not, see <https://www.gnu.org/licenses/>.
}


interface

uses
  SysUtils, Classes, LazUTF8, ucs4unit, ucs4opunit, ucs4functionsunit, TrainerUnit;

// Tokenizes Text, passes every token through the unit's synonym lookup and
// rejoins the tokens with single spaces.
function ReplaceSynonyms(const Text: ucs4): ucs4;
// String overload: converts Text to ucs4 and delegates to the ucs4 overload.
function ReplaceSynonyms(const Text: string): ucs4;
// Returns a copy of Text with random typos (dropped, duplicated or shifted
// characters). NoiseLevel is compared against Random for each element
// processed, so larger values produce more typos.
function AddNoiseToText(const Text: ucs4; NoiseLevel: Double): ucs4;
// String overload: converts Text to ucs4 and delegates to the ucs4 overload.
function AddNoiseToText(const Text: string; NoiseLevel: Double): ucs4;
// Randomly reorders the tokens of Text while leaving the first and the last
// token in place; texts with three or fewer tokens keep their order.
// Tokens are rejoined with single spaces.
function ShuffleWords(const Text: ucs4): ucs4;
// String overload: converts Text to ucs4 and delegates to the ucs4 overload.
function ShuffleWords(const Text: string): ucs4;
// Replaces Dataset with an array of Length(Dataset) * MultiplyFactor entries
// built from the originals plus randomly transformed copies, and prints a
// summary line to standard output. An empty Dataset is left untouched.
procedure AugmentTrainingData(var Dataset: TTrainingDataset; MultiplyFactor: Integer = 2);

implementation

const
  // Synonym groups used by FindSynonym: every inner array lists
  // interchangeable words/phrases (Russian with some English equivalents).
  // FindSynonym lower-cases only the looked-up word, not the table entries,
  // so every entry here must already be lower case.
  // NOTE(review): several entries are multi-word phrases; they can match only
  // if the caller passes the whole phrase as one word - confirm that the
  // tokenizer can actually produce such tokens.
  Synonyms: array of array of string = (
    ('привет', 'здравствуйте', 'добрый день', 'хай', 'hello'),
    ('как дела', 'как жизнь', 'как сам', 'как ты', 'how are you'),
    ('спасибо', 'благодарю', 'мерси', 'thanks', 'thank you'),
    ('пока', 'до свидания', 'прощай', 'bye', 'goodbye'),
    ('хорошо', 'отлично', 'прекрасно', 'замечательно', 'well'),
    ('плохо', 'ужасно', 'не очень', 'так себе', 'bad')
  );

{ Looks Word up (case-insensitively, via UTF8LowerCase) in the Synonyms table.

  Returns a randomly chosen *different* member of the first group that
  contains the word. If the word is not in the table, or its group has no
  other member, Word is returned unchanged (original casing preserved). }
function FindSynonym(const Word: string): string;
var
  groupIdx, memberIdx, groupSize, pick: Integer;
  lowered: string;
begin
  lowered := UTF8LowerCase(Word);
  for groupIdx := 0 to High(Synonyms) do
  begin
    groupSize := Length(Synonyms[groupIdx]);
    for memberIdx := 0 to groupSize - 1 do
    begin
      // Table entries are stored lower-cased, so only the input is folded.
      if lowered = Synonyms[groupIdx][memberIdx] then
      begin
        // A group with a single member offers no alternative.
        if groupSize < 2 then
          Exit(Word);
        // Draw uniformly from the groupSize - 1 other members: pick an index
        // in 0..groupSize-2 and step over the matched position, so the
        // "replacement" is never the word that was just matched.
        pick := Random(groupSize - 1);
        if pick >= memberIdx then
          Inc(pick);
        Exit(Synonyms[groupIdx][pick]);
      end;
    end;
  end;
  Result := Word;
end;

{ String overload of ReplaceSynonyms: converts Text to ucs4 through the
  assignment conversion and forwards it to the ucs4 overload. }
function ReplaceSynonyms(const Text: string): ucs4;
var
  converted: ucs4;
begin
  converted := Text;
  Result := ReplaceSynonyms(converted);
end;

{ Splits Text with TokenizeForNLP, runs every token through FindSynonym and
  joins the results with single spaces. An empty token list yields an empty
  result. }
function ReplaceSynonyms(const Text: ucs4): ucs4;
var
  words: TUC4Array;
  idx: Integer;
  joined: string;
begin
  words := TokenizeForNLP(Text);
  joined := '';

  if Length(words) > 0 then
  begin
    // The first token goes in without a separator ...
    joined := FindSynonym(words[0].ToUTF8);
    // ... every following token is preceded by exactly one space.
    for idx := 1 to High(words) do
      joined := joined + ' ' + FindSynonym(words[idx].ToUTF8);
  end;

  Result := joined;
end;

{ String overload of AddNoiseToText: converts Text to ucs4 through the
  assignment conversion and forwards it, together with NoiseLevel, to the
  ucs4 overload. }
function AddNoiseToText(const Text: string; NoiseLevel: Double): ucs4;
var
  converted: ucs4;
begin
  converted := Text;
  Result := AddNoiseToText(converted, NoiseLevel);
end;

{ Returns a copy of Text with random typos.

  The text is processed one UTF-8 code point at a time, so multi-byte
  characters (e.g. Cyrillic) are never split into invalid byte sequences.
  For each code point, with probability NoiseLevel, one of four equally
  likely actions is taken:
    0 - the character is dropped,
    1 - the character is duplicated,
    2 - the character is replaced by the code point -1/0/+1 away,
    3 - the character is left unchanged.
  NoiseLevel <= 0 returns the text unchanged; NoiseLevel >= 1 attempts an
  action on every character. }
function AddNoiseToText(const Text: ucs4; NoiseLevel: Double): ucs4;
var
  source, noisy, cp: string;
  bytePos, cpSize, decodedLen: Integer;
  code: Int64;
begin
  source := Text.ToUTF8;
  noisy := '';
  bytePos := 1;

  while bytePos <= Length(source) do
  begin
    // Cut out the next code point; clamp the size so that malformed or
    // truncated input can neither stall the loop nor read past the end.
    cpSize := UTF8CodepointSize(@source[bytePos]);
    if cpSize < 1 then
      cpSize := 1;
    if cpSize > Length(source) - bytePos + 1 then
      cpSize := Length(source) - bytePos + 1;
    cp := Copy(source, bytePos, cpSize);
    Inc(bytePos, cpSize);

    if Random < NoiseLevel then
    begin
      case Random(4) of
        0: // Drop the character
          cp := '';
        1: // Duplicate the character (nothing else in the text is lost)
          cp := cp + cp;
        2: // Replace with a neighbouring code point
          begin
            decodedLen := 0;
            code := Int64(UTF8CodepointToUnicode(PChar(cp), decodedLen))
                    + Random(3) - 1;
            // Substitute only when the source was a well-formed code point
            // and the shifted value is a valid, non-NUL Unicode scalar.
            if (decodedLen = Length(cp)) and
               (code >= 1) and (code <= $10FFFF) and
               not ((code >= $D800) and (code <= $DFFF)) then
              cp := UnicodeToUTF8(Cardinal(code));
          end;
        // 3: leave the character as it is
      end;
    end;

    noisy := noisy + cp;
  end;

  Result := noisy;
end;

{ String overload of ShuffleWords: converts Text to ucs4 through the
  assignment conversion and forwards it to the ucs4 overload. }
function ShuffleWords(const Text: string): ucs4;
var
  converted: ucs4;
begin
  converted := Text;
  Result := ShuffleWords(converted);
end;

{ Randomly reorders the tokens of Text, keeping the first and the last token
  in place, and rejoins all tokens with single spaces.

  Texts with three or fewer tokens have at most one interior token, so their
  order is left unchanged. The interior tokens are permuted with the
  Fisher-Yates algorithm, which makes every ordering equally likely. }
function ShuffleWords(const Text: ucs4): ucs4;
var
  tokens: TUC4Array;
  i, j: Integer;
  temp: ucs4;
begin
  tokens := TokenizeForNLP(Text);

  // Shuffle the interior tokens only (indices 1 .. Length - 2).
  if Length(tokens) > 3 then
  begin
    for i := Length(tokens) - 2 downto 2 do
    begin
      // Random(i) is in 0..i-1, so j is uniform over the not yet fixed
      // interior positions 1..i.
      j := Random(i) + 1;
      if j <> i then
      begin
        temp := tokens[i];
        tokens[i] := tokens[j];
        tokens[j] := temp;
      end;
    end;
  end;

  // Join the tokens back together, separated by single spaces.
  Result := default(ucs4);
  Result.Init;
  for i := 0 to High(tokens) do
  begin
    if i > 0 then
      Result := Result + ' ';
    Result := Result + tokens[i];
  end;
end;

{ Expands Dataset in place to Length(Dataset) * MultiplyFactor examples.

  Layout of the result:
    - slots 0 .. originalCount - 1 hold the original examples, in order;
    - the following slots hold MultiplyFactor - 1 augmented copies of each
      original, grouped per original.
  Every augmented copy gets one randomly chosen transformation: synonym
  replacement (Input and ExpectedOutput), typo noise at level 0.1 (Input
  only) or word shuffling (Input only).

  The dataset is left untouched when it is empty or when MultiplyFactor < 1.
  A summary line is written to standard output after a successful run. }
procedure AugmentTrainingData(var Dataset: TTrainingDataset; MultiplyFactor: Integer = 2);
var
  i, j, slot, originalCount: Integer;
  augmentedExamples: TTrainingDataset;
begin
  // Nothing to do for an empty dataset; a factor below 1 would shrink the
  // array below the number of originals and make the copy loop overrun it.
  if (Length(Dataset) = 0) or (MultiplyFactor < 1) then Exit;

  originalCount := Length(Dataset);
  SetLength(augmentedExamples, originalCount * MultiplyFactor);

  for i := 0 to originalCount - 1 do
  begin
    // Keep the original example at its original index.
    augmentedExamples[i] := Dataset[i];

    // Create the augmented versions. They live after the block of originals
    // so that no slot is written twice and none is left uninitialised.
    for j := 1 to MultiplyFactor - 1 do
    begin
      slot := originalCount + i * (MultiplyFactor - 1) + (j - 1);
      augmentedExamples[slot] := Dataset[i];

      // Apply one randomly selected transformation.
      case Random(3) of
        0: // Synonym replacement
          begin
            augmentedExamples[slot].Input :=
              ReplaceSynonyms(augmentedExamples[slot].Input).ToUTF8;
            augmentedExamples[slot].ExpectedOutput :=
              ReplaceSynonyms(augmentedExamples[slot].ExpectedOutput).ToUTF8;
          end;
        1: // Typo noise
          begin
            augmentedExamples[slot].Input :=
              AddNoiseToText(augmentedExamples[slot].Input, 0.1).ToUTF8;
          end;
        2: // Word shuffling
          begin
            augmentedExamples[slot].Input :=
              ShuffleWords(augmentedExamples[slot].Input).ToUTF8;
          end;
      end;
    end;
  end;

  // Replace the original dataset with the expanded one.
  Dataset := augmentedExamples;
  WriteLn('Датасет расширен: ', originalCount, ' -> ', Length(Dataset), ' примеров');
end;

end.