unit ucs4unit;

{
    UCS4 unit.
    For GNU/Linux 64 bit version.
    Version: 2.
    Written on FreePascal (https://freepascal.org/).
    Copyright (C) 2025  Artyomov Alexander
    Used https://chat.deepseek.com/
    http://self-made-free.ru/
    aralni@mail.ru

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License as
    published by the Free Software Foundation, either version 3 of the
    License, or (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License
    along with this program.  If not, see <https://www.gnu.org/licenses/>.
}

{$MODE OBJFPC}{$H+}
{$RANGECHECKS ON}
{$OPTIMIZATION LEVEL3}
{$INLINE ON}

interface

uses SysUtils, LazUTF8, Math;

type
  ucs4 = packed object
  public
    FLength: DWord;
    FCapacity: DWord;
    FData: PDWord;
    procedure Grow(MinCapacity: DWord); inline;
    function GetChar(Index: DWord): DWord; inline;
    procedure SetChar(Index: DWord; Value: Dword); inline;
    property Length: DWord read FLength;
    property Chars[Index: DWord]: DWord read GetChar write SetChar; default;
    procedure Init; inline;
    procedure Init(l:DWord); inline;
    procedure Clear; inline;
    procedure Reverse;
    function IsRTL: Boolean;
    function Concat(const S: ucs4): ucs4;
    procedure FromUTF8(const S: string);
    function ToUTF8: string;
    procedure Resize(NewLength: DWord);
    procedure Append(Char: DWord); overload;
    procedure Append(const S: ucs4); overload;
    procedure Assign(const S: ucs4);
    procedure Insert(Pos: DWord; const S: ucs4);
    procedure Delete(Pos, Count: DWord);
    procedure PadLeft(NewLen: DWord; Char: DWord = $0020);
    procedure PadRight(NewLen: DWord; Char: DWord = $0020);
    function Compare(const S: ucs4): Integer;
    function ICompare(const S: ucs4): Integer;
    function Equals(const S: ucs4): Boolean;
    function StartsWith(const S: ucs4): Boolean;
    function EndsWith(const S: ucs4): Boolean;
    function Contains(const S: ucs4): Boolean;
    function Hash: QWord;
    function TrimLeft: ucs4;
    function TrimRight: ucs4;
    procedure ReplaceChar(OldChar, NewChar: DWord);
    procedure Up;
    procedure Down;
  end;
  PUCS4 = ^ucs4;
  TUC4Array = array of ucs4;
  TUC4Matrix = array of TUC4Array;

ucs4ext = packed object(ucs4)
public
function SubString(StartPos, Len: DWord): ucs4;
function IndexOf(const SubStr: ucs4; StartPos: DWord = 0): Integer;
function LastIndexOf(const SubStr: ucs4): Integer;
function Replace(const OldPattern, NewPattern: ucs4): ucs4;
function Split(Delimiter: DWord): TUC4Array;
function Trim: ucs4;
function ToLower: ucs4;
function ToUpper: ucs4;
end;
TUC4ExtArray = array of ucs4ext;
TUC4ExtMatrix = array of TUC4ExtArray;

function IsValidUnicode(CodePoint: DWord): Boolean;

implementation

uses ucs4functionsunit;

procedure ucs4.Init;
begin
  FData := nil;
  FLength := 0;
  FCapacity := 0;
end;

procedure ucs4.Init(l:DWord);
begin
  FLength := l;
  FCapacity := l;
  FData := GetMem(l*SizeOf(DWord));
end;


procedure ucs4.Grow(MinCapacity: DWord);
var
  NewCapacity: DWord;
begin
  if FCapacity = 0 then
    NewCapacity := 8
  else
    NewCapacity := FCapacity * 2;

  if NewCapacity < MinCapacity then
    NewCapacity := MinCapacity;

  ReallocMem(FData, NewCapacity * SizeOf(DWord));
  FCapacity := NewCapacity;
end;

function ucs4.GetChar(Index: DWord): DWord; inline;
begin
Exit(FData[Index]);
end;

procedure ucs4.SetChar(Index: DWord; Value: Dword); inline;
begin
FData[Index] := Value;
end;

procedure ucs4.Clear;
begin
  if FData <> nil then
  begin
    FreeMem(FData);
    FData := nil;
  end;
  FLength := 0;
  FCapacity := 0;
end;

procedure ucs4.Reverse;
var
  I: DWord;
  Tmp: DWord;
  P1, P2: PDWord;
begin
  if FLength <= 1 then Exit;

  P1 := @FData[0];
  P2 := @FData[FLength-1];

  while P1 < P2 do
  begin
    Tmp := P1^;
    P1^ := P2^;
    P2^ := Tmp;
    Inc(P1);
    Dec(P2);
  end;
end;

function ucs4.IsRTL: Boolean;
var
  I: DWord;
begin
  for I := 0 to FLength - 1 do
    if (FData[I] >= $0590) and (FData[I] <= $08FF) then
      Exit(True);
  Result := False;
end;

function ucs4.Concat(const S: ucs4): ucs4;
begin Result:=default(ucs4);
  Result.Init;
  if Self.FLength + S.FLength = 0 then Exit;

  GetMem(Result.FData, (Self.FLength + S.FLength) * SizeOf(DWord));
  Result.FCapacity := Self.FLength + S.FLength;
  Result.FLength := Result.FCapacity;

  if Self.FLength > 0 then
    Move(Self.FData^, Result.FData^, Self.FLength * SizeOf(DWord));

  if S.FLength > 0 then
    Move(S.FData^, Result.FData[Self.FLength], S.FLength * SizeOf(DWord));
end;

procedure ucs4.FromUTF8(const S: string);
var
  UTF8Ptr: PChar;
  CharLen: Integer;
  Count, Pos: DWord;
begin
  Clear;
  if S = '' then Exit;

  // Первый проход - подсчет символов
  Count := 0;
  UTF8Ptr := PChar(S);
  while UTF8Ptr^ <> #0 do
  begin
    UTF8CodepointToUnicode(UTF8Ptr, CharLen);
    Inc(UTF8Ptr, CharLen);
    Inc(Count);
  end;

  // Выделение памяти
  if Count > FCapacity then
    Grow(Count);
  FLength := Count;

  // Второй проход - заполнение
  UTF8Ptr := PChar(S);
  Pos := 0;
  while UTF8Ptr^ <> #0 do
  begin
    FData[Pos] := UTF8CodepointToUnicode(UTF8Ptr, CharLen);
    Inc(UTF8Ptr, CharLen);
    Inc(Pos);
  end;
end;

function ucs4.ToUTF8: string;
var
  I, Len: Integer;
  P: PChar;
begin
  if FLength = 0 then Exit('');

  // Максимально возможный размер (4 байта на символ)
  SetLength(Result, FLength * 4);
  P := PChar(Result);

  for I := 0 to FLength - 1 do
    Inc(P, UnicodeToUTF8(FData[I], P));

  // Корректируем длину под реальный размер
  SetLength(Result, P - PChar(Result));
end;

function ucs4ext.SubString(StartPos, Len: DWord): ucs4;
begin Result:=default(ucs4);
  if StartPos >= FLength then
  begin
    Result.Init;
    Exit;
  end;

  if StartPos + Len > FLength then
    Len := FLength - StartPos;

  Result.Init(Len);
  Move(FData[StartPos], Result.FData^, Len * SizeOf(DWord));
end;

function ucs4ext.IndexOf(const SubStr: ucs4; StartPos: DWord = 0): Integer;
var
  i, j: Integer;
  found: Boolean;
begin
  if (SubStr.Length = 0) or (FLength = 0) or (SubStr.Length > FLength) or (StartPos >= FLength) then
    Exit(-1);

  for i := StartPos to FLength - SubStr.Length do
  begin
    found := True;
    for j := 0 to SubStr.Length - 1 do
      if FData[i + j] <> SubStr[j] then
      begin
        found := False;
        Break;
      end;
    if found then
      Exit(i);
  end;
  Result := -1;
end;

function ucs4ext.LastIndexOf(const SubStr: ucs4): Integer;
var
  i, j: Integer;
  found: Boolean;
begin
  if (SubStr.Length = 0) or (FLength = 0) or (SubStr.Length > FLength) then
    Exit(-1);

  for i := FLength - SubStr.Length downto 0 do
  begin
    found := True;
    for j := 0 to SubStr.Length - 1 do
      if FData[i + j] <> SubStr[j] then
      begin
        found := False;
        Break;
      end;
    if found then
      Exit(i);
  end;
  Result := -1;
end;

function ucs4ext.Replace(const OldPattern, NewPattern: ucs4): ucs4;
var
  i, pos: Integer;
  temp: ucs4;
begin
  if (OldPattern.Length = 0) or (FLength = 0) then
    Exit(Self);

temp:=default(ucs4);
  temp.Init;
  i := 0;

  while i < FLength do
  begin
    pos := IndexOf(OldPattern, i);
    if pos = -1 then
    begin
      // Добавляем оставшуюся часть строки
      temp := temp.Concat(SubString(i, FLength - i));
      Break;
    end;

    // Добавляем часть до совпадения
    temp := temp.Concat(SubString(i, pos - i));
    // Добавляем замену
    temp := temp.Concat(NewPattern);
    // Перемещаем указатель
    i := pos + OldPattern.Length;
  end;

  Result := temp;
end;

function ucs4ext.Split(Delimiter: DWord): TUC4Array;
var
  i, start, count: Integer;
begin
  if FLength = 0 then
    Exit(nil);

  // Подсчет количества разделителей
  count := 0;
  for i := 0 to FLength - 1 do
    if FData[i] = Delimiter then
      Inc(count);

  SetLength(Result, count + 1);
  start := 0;
  count := 0;

  for i := 0 to FLength - 1 do
    if FData[i] = Delimiter then
    begin
      Result[count] := SubString(start, i - start);
      Inc(count);
      start := i + 1;
    end;

  // Последний сегмент
  Result[count] := SubString(start, FLength - start);
end;

function ucs4ext.Trim: ucs4;
var
  start, len: Integer;
begin
  if FLength = 0 then
    Exit(Self);

  // Находим первый не-пробельный символ
  start := 0;
  while (start < FLength) and (IsWhiteSpace(FData[start])) do
    Inc(start);

  // Находим последний не-пробельный символ
  len := FLength;
  while (len > start) and (IsWhiteSpace(FData[len - 1])) do
    Dec(len);

  len := len - start;
  if len <= 0 then
    Exit(Default(ucs4));

  Result := SubString(start, len);
end;

function ucs4ext.ToLower: ucs4;
var
  i: Integer;
begin Result:=default(ucs4);
  Result.Init(FLength);
  for i := 0 to FLength - 1 do
    Result[i] := UnicodeToLower(FData[i]);
end;

function ucs4ext.ToUpper: ucs4;
var
  i: Integer;
begin Result:=default(ucs4);
  Result.Init(FLength);
  for i := 0 to FLength - 1 do
    Result[i] := UnicodeToUpper(FData[i]);
end;

function IsValidUnicode(CodePoint: DWord): Boolean;
begin
  Result := (CodePoint <= $10FFFF) and 
           ((CodePoint < $D800) or (CodePoint > $DFFF)) and // Исключаем суррогатные пары
           (CodePoint <> $FFFD) and // Исключаем replacement character
           (CodePoint <> $FFFF) and (CodePoint <> $FFFE); // Исключаем non-characters
end;

procedure ucs4.Resize(NewLength: DWord);
var
  OldLength: DWord;
  I: DWord;
begin
  if NewLength = FLength then
    Exit;

  OldLength := FLength;

  // Если нужно увеличить – гарантируем память
  if NewLength > FCapacity then
    Grow(NewLength);

  // Если увеличили длину – добиваем пробелами (U+0020)
  if NewLength > OldLength then
  begin
    for I := OldLength to NewLength - 1 do
      FData[I] := $0020; // пробел
  end;

  // Просто меняем длину (при уменьшении физическая память не трогается)
  FLength := NewLength;
end;

procedure ucs4.Append(Char: DWord);
begin
  if FLength + 1 > FCapacity then
    Grow(FLength + 1);

  FData[FLength] := Char;
  Inc(FLength);
end;


procedure ucs4.Append(const S: ucs4);
begin
  if S.FLength = 0 then
    Exit;

  if FLength + S.FLength > FCapacity then
    Grow(FLength + S.FLength);

  Move(S.FData^, FData[FLength], S.FLength * SizeOf(DWord));

  Inc(FLength, S.FLength);
end;

procedure ucs4.Assign(const S: ucs4);
begin
  if @Self = @S then
    Exit;

  if S.FLength = 0 then
  begin
    Clear;
    Exit;
  end;

  if S.FLength > FCapacity then
    Grow(S.FLength);

  Move(S.FData^, FData^, S.FLength * SizeOf(DWord));
  FLength := S.FLength;
end;

procedure ucs4.Insert(Pos: DWord; const S: ucs4);
begin
  if S.FLength = 0 then
    Exit;

  if Pos > FLength then
    Pos := FLength;

  if FLength + S.FLength > FCapacity then
    Grow(FLength + S.FLength);

  // Сдвигаем хвост
  Move(
    FData[Pos],
    FData[Pos + S.FLength],
    (FLength - Pos) * SizeOf(DWord)
  );

  // Вставляем
  Move(S.FData^, FData[Pos], S.FLength * SizeOf(DWord));

  Inc(FLength, S.FLength);
end;

procedure ucs4.Delete(Pos, Count: DWord);
begin
  if (Pos >= FLength) or (Count = 0) then
    Exit;

  if Pos + Count > FLength then
    Count := FLength - Pos;

  Move(
    FData[Pos + Count],
    FData[Pos],
    (FLength - (Pos + Count)) * SizeOf(DWord)
  );

  Dec(FLength, Count);
end;

procedure ucs4.PadLeft(NewLen: DWord; Char: DWord = $0020);
var
  Diff: DWord;
  I: Integer;
begin
  if NewLen <= FLength then Exit;

  Diff := NewLen - FLength;

  if NewLen > FCapacity then
    Grow(NewLen);

  // Сдвигаем всё вправо
  Move(FData[0], FData[Diff], FLength * SizeOf(DWord));

  // Заполняем начало
  for I := 0 to Diff - 1 do
    FData[I] := Char;

  FLength := NewLen;
end;


procedure ucs4.PadRight(NewLen: DWord; Char: DWord = $0020);
var
  I: Integer;
begin
  if NewLen <= FLength then Exit;

  if NewLen > FCapacity then
    Grow(NewLen);

  for I := FLength to NewLen - 1 do
    FData[I] := Char;

  FLength := NewLen;
end;

function ucs4.Compare(const S: ucs4): Integer;
var
  I, MinLen: LongInt;
begin
  if @Self = @S then Exit(0);

  MinLen := FLength;
  if S.FLength < MinLen then
    MinLen := S.FLength;

  for I := 0 to MinLen - 1 do
  begin
    if FData[I] < S.FData[I] then Exit(-1);
    if FData[I] > S.FData[I] then Exit(1);
  end;

  if FLength = S.FLength then
    Result := 0
  else if FLength < S.FLength then
    Result := -1
  else
    Result := 1;
end;


function ucs4.ICompare(const S: ucs4): Integer;
var
  I, MinLen: LongInt;
  C1, C2: LongInt;
begin
  if @Self = @S then Exit(0);

  MinLen := FLength;
  if S.FLength < MinLen then
    MinLen := S.FLength;

  for I := 0 to MinLen - 1 do
  begin
    C1 := UnicodeToLower(FData[I]);
    C2 := UnicodeToLower(S.FData[I]);

    if C1 < C2 then Exit(-1);
    if C1 > C2 then Exit(1);
  end;

  if FLength = S.FLength then
    Result := 0
  else if FLength < S.FLength then
    Result := -1
  else
    Result := 1;
end;


function ucs4.Equals(const S: ucs4): Boolean;
begin
  if FLength <> S.FLength then Exit(False);
  if FLength = 0 then Exit(True);

  Result := Compare(S) = 0;
end;

function ucs4.StartsWith(const S: ucs4): Boolean;
begin
  if (S.FLength = 0) or (S.FLength > FLength) then
    Exit(False);

  Result := Compare(ucs4ext(S).SubString(0, S.FLength)) = 0;
end;


function ucs4.EndsWith(const S: ucs4): Boolean;
var
  StartPos: DWord;
begin
  if (S.FLength = 0) or (S.FLength > FLength) then
    Exit(False);

  StartPos := FLength - S.FLength;

  Result := Compare(ucs4ext(S).SubString(0, S.FLength)) = 0;  // осторожно → ниже исправим
  Result := (ucs4ext(self).IndexOf(S, StartPos) = Integer(StartPos));
end;

function ucs4.Contains(const S: ucs4): Boolean;
begin
  if S.FLength = 0 then Exit(False);

  Result := ucs4ext(self).IndexOf(S) >= 0;
end;

function ucs4.Hash: QWord;
var
  I: DWord;
begin
  Result := 1469598103934665603; // FNV offset basis
  for I := 0 to FLength - 1 do
  begin
    Result := Result xor FData[I];
    Result := Result * 1099511628211;
  end;
end;

function ucs4.TrimLeft: ucs4;
var
  StartPos: DWord;
begin
  if FLength = 0 then Exit(Self);

  StartPos := 0;
  while (StartPos < FLength) and IsWhiteSpace(FData[StartPos]) do
    Inc(StartPos);

  if StartPos = 0 then Exit(Self);

  Result := ucs4ext(self).SubString(StartPos, FLength - StartPos);
end;

function ucs4.TrimRight: ucs4;
var
  EndPos: Integer;
begin
  if FLength = 0 then Exit(Self);

  EndPos := FLength - 1;
  while (EndPos >= 0) and IsWhiteSpace(FData[EndPos]) do
    Dec(EndPos);

  if EndPos = Integer(FLength) - 1 then Exit(Self);

  Result := ucs4ext(self).SubString(0, EndPos + 1);
end;

procedure ucs4.ReplaceChar(OldChar, NewChar: DWord);
var
  I: DWord;
begin
  if OldChar = NewChar then Exit;

  for I := 0 to FLength - 1 do
    if FData[I] = OldChar then
      FData[I] := NewChar;
end;

procedure ucs4.Up;
var i: DWord;
begin
for i := 0 to FLength-1 do FData[I] := UnicodeToUpper(FData[i]);
end;

procedure ucs4.Down;
var i: DWord;
begin
for i := 0 to FLength-1 do FData[i] := UnicodeToLower(FData[I]);
end;

end.