unit OpenCorporaConverter;

{
    Part of AdvancedChatAI.
    For GNU/Linux 64 bit version.
    Version: 1.
    Written in Free Pascal (https://freepascal.org/).
    Copyright (C) 2025-2026 Artyomov Alexander
    Used https://chat.deepseek.com/
    http://self-made-free.ru/
    aralni@mail.ru

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License as
    published by the Free Software Foundation, either version 3 of the
    License, or (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License
    along with this program.  If not, see <https://www.gnu.org/licenses/>.
}

{$MODE OBJFPC}{$H+}{$RANGECHECKS ON}

interface

uses
  SysUtils, Classes;

{ Reads InputFile, where each useful line has three tab-separated fields
  (word form, lemma, tags), and writes OutputFile with one
  "wordform|lemma" entry per line.
  Only entries whose tags begin with NOUN, VERB, ADJ or ADV and contain
  none of the markers Name, Surn, Patr, Geox are kept.
  Progress messages are printed to standard output. }
procedure ConvertOpenCorporaToDict(const InputFile, OutputFile: string);

implementation

{ Converts a tab-separated word list into a "wordform|lemma" dictionary.

  Each input line is expected to hold three tab-separated fields:
  word form, lemma, tags (everything after the second tab is treated as
  tags). Empty lines and lines with fewer than two tabs are skipped.
  An entry is kept only when its tags begin with NOUN, VERB, ADJ or ADV
  and contain none of the markers Name, Surn, Patr, Geox.

  InputFile  - path of the file to read (loaded fully into memory).
  OutputFile - path of the file to write; one "wordform|lemma" per line.

  Progress and summary messages are written to standard output.
  Exceptions raised while loading or saving propagate to the caller;
  both string lists are released in every case. }
procedure ConvertOpenCorporaToDict(const InputFile, OutputFile: string);
var
  // Deliberately not called Input/Output: those names would hide the
  // standard text-file identifiers declared in the System unit.
  SrcLines, DictLines: TStringList;
  i, p: Integer;
  Line, WordForm, Lemma, Tags: string;
  ValidPos: Boolean;
begin
  // Start from nil so the single finally block below can free both lists
  // safely even when one of the constructors raises (Free is nil-safe).
  SrcLines := nil;
  DictLines := nil;
  try
    SrcLines := TStringList.Create;
    DictLines := TStringList.Create;

    WriteLn('Loading input file...');
    SrcLines.LoadFromFile(InputFile);

    WriteLn('Processing ', SrcLines.Count, ' lines...');
    for i := 0 to SrcLines.Count - 1 do
    begin
      Line := SrcLines[i];
      if Line = '' then Continue;

      // Split the line into its fields: first field is the word form
      p := Pos(#9, Line);
      if p = 0 then Continue;

      WordForm := Copy(Line, 1, p-1);
      Delete(Line, 1, p);

      // Second field is the lemma; the remainder of the line is the tags
      p := Pos(#9, Line);
      if p = 0 then Continue;

      Lemma := Copy(Line, 1, p-1);
      Tags := Copy(Line, p+1, Length(Line));

      // Filter by part of speech: the tag string must START with one of
      // these prefixes (NOUN, VERB, ADJ, ADV)
      ValidPos := (Pos('NOUN', Tags) = 1) or
                  (Pos('VERB', Tags) = 1) or
                  (Pos('ADJ', Tags) = 1) or
                  (Pos('ADV', Tags) = 1);

      // Exclude proper names and other unwanted forms: any occurrence of
      // these markers anywhere in the tags rejects the entry
      if ValidPos and
         (Pos('Name', Tags) = 0) and
         (Pos('Surn', Tags) = 0) and
         (Pos('Patr', Tags) = 0) and
         (Pos('Geox', Tags) = 0) then
      begin
        DictLines.Add(WordForm + '|' + Lemma);
      end;

      // Progress report. Only reached for lines that passed the
      // field-splitting checks above (skipped lines Continue earlier).
      if i mod 10000 = 0 then
        WriteLn('Processed ', i, ' of ', SrcLines.Count, ' lines (', Lemma, ')');
    end;

    WriteLn('Saving results...');
    DictLines.SaveToFile(OutputFile);
    WriteLn('Done. Saved ', DictLines.Count, ' entries.');
  finally
    DictLines.Free;
    SrcLines.Free;
  end;
end;

end.