unit NLPCore;
{$MODE OBJFPC}{$H+}{$RANGECHECKS ON}

{
    Part of AdvancedChatAI.
    For GNU/Linux 64 bit version.
    Version: 1.
    Written in Free Pascal (https://freepascal.org/).
    Copyright (C) 2025-2026 Artyomov Alexander
    Used https://chat.deepseek.com/
    http://self-made-free.ru/
    aralni@mail.ru

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License as
    published by the Free Software Foundation, either version 3 of the
    License, or (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License
    along with this program.  If not, see <https://www.gnu.org/licenses/>.
}


interface

uses
  SysUtils, Classes, StrUtils, NLPUtils;

type
  { A single token together with the annotations that
    TTextProcessor.Tokenize attaches to it. }
  TToken = record
    // Token text exactly as returned by the underlying tokenizer.
    Text: string;
    // Value returned by TTextProcessor.Lemmatize for Text.
    Lemma: string;
    PosTag: string; // part of speech
    // True when the lower-cased Text was found in the processor's stop-word list.
    IsStopWord: Boolean;
  end;
  
  // Dynamic array of tokens; the result type of Tokenize and RemoveStopWords.
  TTokenArray = array of TToken;

  { Text-processing front end: splits text into tokens, annotates every token
    with a lemma, a part-of-speech tag and a stop-word flag, and can filter
    the flagged tokens out again. }
  TTextProcessor = class
  private
    // Stop-word list; filled by LoadStopWords and searched by Tokenize.
    FStopWords: TStringArray;
    // Reads the stop-word list from the file FileName into FStopWords.
    procedure LoadStopWords(const FileName: string);
  public
    // Creates the processor; stop words are loaded from StopWordsFile unless it is ''.
    constructor Create(const StopWordsFile: string = '');
    // Tokenizes Text via NLPUtils.Tokenize and fills Lemma, PosTag and IsStopWord for each token.
    function Tokenize(const Text: string): TTokenArray;
    // Returns the lemma of Token; currently a placeholder that returns LowerCase(Token).
    function Lemmatize(const Token: string): string;
    // Returns the part-of-speech tag of Token; currently a placeholder that always returns 'UNKN'.
    function PosTag(const Token: string): string;
    // Returns the tokens whose IsStopWord flag is False, in their original order.
    function RemoveStopWords(const Tokens: TTokenArray): TTokenArray;
  end;

implementation

{ Creates a text processor.

  StopWordsFile - optional path to a stop-word file. When it is an empty
                  string no file is read and no stop words are loaded.

  Any exception raised while reading the file (see LoadStopWords) propagates
  out of the constructor. }
constructor TTextProcessor.Create(const StopWordsFile: string);
begin
  // Always run the ancestor constructor first so that base-class
  // initialization is not skipped if the ancestor ever changes.
  inherited Create;

  if StopWordsFile <> '' then
    LoadStopWords(StopWordsFile);
end;

{ Loads the stop-word list from FileName into FStopWords.

  The file is read with TStringList.LoadFromFile and is expected to contain
  one stop word per line. Every line is trimmed and lower-cased, and blank
  lines are skipped, so that the stored entries can match the lower-cased,
  case-sensitive lookup that Tokenize performs against FStopWords.

  FStopWords is only replaced after the file has been read successfully; an
  exception from LoadFromFile (e.g. the file cannot be opened) propagates to
  the caller and leaves the previous list untouched.

  NOTE(review): LowerCase is presumably SysUtils.LowerCase, which only
  converts ASCII letters - the same limitation applies to the lookup key in
  Tokenize, so both sides stay consistent. Confirm if non-ASCII stop words
  need case folding. }
procedure TTextProcessor.LoadStopWords(const FileName: string);
var
  List: TStringList;
  I, Count: Integer;
  Entry: string;
begin
  List := TStringList.Create;
  try
    List.LoadFromFile(FileName);

    // Use the lines TStringList has already parsed instead of re-splitting
    // List.Text; reserve the worst case and shrink once at the end.
    SetLength(FStopWords, List.Count);
    Count := 0;
    for I := 0 to List.Count - 1 do
    begin
      Entry := LowerCase(Trim(List[I]));
      if Entry <> '' then
      begin
        FStopWords[Count] := Entry;
        Inc(Count);
      end;
    end;
    SetLength(FStopWords, Count);
  finally
    List.Free;
  end;
end;

{ Splits Text into tokens and annotates each of them.

  The raw token strings come from NLPUtils.Tokenize. For every raw token the
  result holds its text, the value of Lemmatize, the value of PosTag, and a
  stop-word flag that is True when the lower-cased token is found in
  FStopWords (case-sensitive search via AnsiIndexStr).

  Returns one TToken per raw token, in the same order. }
function TTextProcessor.Tokenize(const Text: string): TTokenArray;
var
  Words: TStringArray;
  Idx: Integer;
  Current: TToken;
begin
  Words := NLPUtils.Tokenize(Text);
  SetLength(Result, Length(Words));

  for Idx := Low(Words) to High(Words) do
  begin
    // Build the record locally, then store it in one assignment.
    Current.Text := Words[Idx];
    Current.Lemma := Lemmatize(Words[Idx]);
    Current.PosTag := PosTag(Words[Idx]);
    Current.IsStopWord := AnsiIndexStr(LowerCase(Words[Idx]), FStopWords) >= 0;
    Result[Idx] := Current;
  end;
end;

{ Returns the lemma of Token.

  Placeholder implementation: the token is only passed through LowerCase.

  NOTE(review): LowerCase is presumably SysUtils.LowerCase, which is
  documented as converting ASCII letters only - non-ASCII (e.g. Cyrillic)
  letters would be returned unchanged. Confirm whether that is intended. }
function TTextProcessor.Lemmatize(const Token: string): string;
begin
  // TODO: implement real lemmatization (simple rules are a reasonable start)
  Exit(LowerCase(Token));
end;

{ Returns the part-of-speech tag for Token.

  Placeholder implementation: Token is ignored and the constant tag 'UNKN'
  is returned for every input. }
function TTextProcessor.PosTag(const Token: string): string;
begin
  // TODO: implement part-of-speech detection (simple rules are a reasonable start)
  Exit('UNKN');
end;

{ Returns the tokens from Tokens whose IsStopWord flag is False.

  The relative order of the kept tokens is preserved and Tokens itself is
  only read. An empty input yields an empty result. }
function TTextProcessor.RemoveStopWords(const Tokens: TTokenArray): TTokenArray;
var
  Src, Kept: Integer;
begin
  // Reserve room for the worst case (nothing filtered out); the array is
  // trimmed to the number of kept tokens once the scan is finished.
  SetLength(Result, Length(Tokens));
  Kept := 0;

  for Src := Low(Tokens) to High(Tokens) do
  begin
    if Tokens[Src].IsStopWord then
      Continue;

    Result[Kept] := Tokens[Src];
    Inc(Kept);
  end;

  SetLength(Result, Kept);
end;

end.