Following on from my recent post (https://plus.

Following on from my recent post (https://plus.google.com/103246155735524926641/posts/JMzpSCFyUCD) about poor performance of dcc64, here is an MCVE:

Pascal, LUperf.dpr

program LUperf;

{$APPTYPE CONSOLE}
{$POINTERMATH ON}

uses
System.Diagnostics,
System.Win.Crtl;

function LinearIndex(i, j, d: Integer): Integer;
begin
Result := i * (d + 1) + j;
end;

procedure PascalLUDecomposeBandedReal(A, Al: PDouble; Index: PInteger; N, Bandwidth: Integer); cdecl;
var
d: Double;
i, j, k, l, iidx, kidx, Alidx: Integer;
mm: Integer;
tmp1, tmp2: Double;
begin
mm := 1+2*Bandwidth;
l := Bandwidth;
for i := 1 to Bandwidth do begin
iidx := LinearIndex(i, 0, mm);
for j := Bandwidth+2-i to mm do begin
A[iidx + j - l] := A[iidx + j];
end;
dec(l);
for j := mm-l to mm do begin
A[iidx + j] := 0.0;
end;
end;
d := 1.0;
l := Bandwidth;
for k := 1 to N do begin
kidx := LinearIndex(k, 0, mm);
tmp1 := A[kidx + 1];
i := k;
if l inc(l);
end;
for j := k+1 to l do begin
if abs(A[LinearIndex(j, 1, mm)])>abs(tmp1) then begin
tmp1 := A[LinearIndex(j, 1, mm)];
i := j;
end;
end;
Index[k] := i;
if tmp1=0.0 then begin
A[kidx + 1] := 1.0e-20;
end;
if i<>k then begin
iidx := LinearIndex(i, 0, mm);
d := -d;
for j := 1 to mm do begin
tmp1 := A[kidx + j];
A[kidx + j] := A[iidx + j];
A[iidx + j] := tmp1
end;
end;
tmp2 := 1.0/A[kidx + 1];
Alidx := k*(Bandwidth-1);
for i := k+1 to l do begin
iidx := LinearIndex(i, 0, mm);
tmp1 := A[iidx + 1]*tmp2;
Al[Alidx + i] := tmp1;
for j := 2 to mm do begin
A[iidx + j - 1] := A[iidx + j]-tmp1*A[kidx + j];
end;
A[iidx + mm] := 0.0;
end;
end;
end;

{$L LU.obj}

procedure CLUDecomposeBandedReal(A, Al: PDouble; Index: PInteger; N, Bandwidth: Integer); cdecl; external name 'LUDecomposeBanded';

type
TLUDecomposeBandedRealProc = procedure(A, Al: PDouble; Index: PInteger; N, Bandwidth: Integer); cdecl;

procedure Test(Proc: TLUDecomposeBandedRealProc);
const
N = 5000;
Bandwidth = 8;
IterationCount = 5000;
var
i, j, mm: Integer;
Input, A, Al: TArray;
Index: TArray;
Stopwatch: TStopwatch;
begin
mm := 1+2*Bandwidth;
SetLength(Input, (1+N)*(1+mm));
for i := 1 to N do begin
for j := 1 to mm do begin
Input[LinearIndex(i, j, mm)] := 0.5 + i - j;
end;
end;
SetLength(Al, (1+N)*(1+BandWidth));
SetLength(Index, 1+N);

SetLength(A, Length(Input));

Stopwatch := TStopwatch.StartNew;
for i := 1 to IterationCount do begin
Move(Pointer(Input)^, Pointer(A)^, Length(A)*SizeOf(A[0]));
Proc(PDouble(A), PDouble(Al), PInteger(Index), N, Bandwidth);
end;
Writeln(Stopwatch.ElapsedMilliseconds);
end;

begin
Test(PascalLUDecomposeBandedReal);
Test(CLUDecomposeBandedReal);
Readln;
end.

C, LU.c

// compile with cl /c /O2 LU.c

#include

int CLinearIndex(int i, int j, int d)
{
return i * (d + 1) + j;
}

void LUDecomposeBanded(double *A, double *Al, int *Index, int N, int Bandwidth)
{
double d, tmp1, tmp2;
int i, j, k, l, mm, iidx, kidx, Alidx;

mm = 1 + 2 * Bandwidth;
l = Bandwidth;
for (i = 1; i <= Bandwidth; i++)
{
iidx = CLinearIndex(i, 0, mm);
for (j = Bandwidth + 2 - i; j <= mm; j++)
{
A[iidx + j - l] = A[iidx + j];
}
l--;
for (j = mm - l; j <= mm; j++)
{
A[iidx + j] = 0.0;
}
}
d = 1.0;
l = Bandwidth;
for (k = 1; k <= N; k++)
{
kidx = CLinearIndex(k, 0, mm);
tmp1 = A[kidx + 1];
i = k;
if (l < N)
{
l++;
}
for (j = k+1; j <= l; j++)
{
if (fabs(A[CLinearIndex(j, 1, mm)]) > fabs(tmp1))
{
tmp1 = A[CLinearIndex(j, 1, mm)];
i = j;
}
}
Index[k] = i;
if (tmp1 == 0.0)
{
A[kidx + 1] = 1.0e-20;
}
if (i != k)
{
iidx = CLinearIndex(i, 0, mm);
d = -d;
for (j = 1; j <= mm; j++)
{
tmp2 = A[kidx + j];
A[kidx + j] = A[iidx + j];
A[iidx + j] = tmp2;
}
}
tmp2 = 1.0 / A[kidx + 1];
Alidx = k * (Bandwidth - 1);
for (i = k + 1; i <= l; i++)
{
iidx = CLinearIndex(i, 0, mm);
tmp1 = A[iidx + 1] * tmp2;
Al[Alidx + i] = tmp1;
for (j = 2; j <= mm; j++)
{
A[iidx + j - 1] = A[iidx + j] - tmp1 * A[kidx + j];
}
A[iidx + mm] = 0.0;
}
}
}

Output

9570
3581

I tested with XE7 and Seattle (results the same), and compiled the C code with cl 19.00.23026 for x64.

Having moved the Pascal code out of an instance method and into an unbound procedure that is passed pointers to data helped. But the C code is still 2.5 times faster.

Marco Cantù Are Embarcadero interested in improving performance of the desktop compilers?

Search This Blog

Delphi Developers Archive

Following on from my recent post (https://plus.

Comments

Post a Comment