Following on from my recent post (https://plus.google.com/103246155735524926641/posts/JMzpSCFyUCD) about poor performance of dcc64, here is an MCVE:
Following on from my recent post (https://plus.google.com/103246155735524926641/posts/JMzpSCFyUCD) about poor performance of dcc64, here is an MCVE:
Pascal, LUperf.dpr
program LUperf;
{$APPTYPE CONSOLE}
{$POINTERMATH ON}
uses
System.Diagnostics,
System.Win.Crtl;
function LinearIndex(i, j, d: Integer): Integer;
begin
Result := i * (d + 1) + j;
end;
procedure PascalLUDecomposeBandedReal(A, Al: PDouble; Index: PInteger; N, Bandwidth: Integer); cdecl;
var
d: Double;
i, j, k, l, iidx, kidx, Alidx: Integer;
mm: Integer;
tmp1, tmp2: Double;
begin
mm := 1+2*Bandwidth;
l := Bandwidth;
for i := 1 to Bandwidth do begin
iidx := LinearIndex(i, 0, mm);
for j := Bandwidth+2-i to mm do begin
A[iidx + j - l] := A[iidx + j];
end;
dec(l);
for j := mm-l to mm do begin
A[iidx + j] := 0.0;
end;
end;
d := 1.0;
l := Bandwidth;
for k := 1 to N do begin
kidx := LinearIndex(k, 0, mm);
tmp1 := A[kidx + 1];
i := k;
if l inc(l);
end;
for j := k+1 to l do begin
if abs(A[LinearIndex(j, 1, mm)])>abs(tmp1) then begin
tmp1 := A[LinearIndex(j, 1, mm)];
i := j;
end;
end;
Index[k] := i;
if tmp1=0.0 then begin
A[kidx + 1] := 1.0e-20;
end;
if i<>k then begin
iidx := LinearIndex(i, 0, mm);
d := -d;
for j := 1 to mm do begin
tmp1 := A[kidx + j];
A[kidx + j] := A[iidx + j];
A[iidx + j] := tmp1
end;
end;
tmp2 := 1.0/A[kidx + 1];
Alidx := k*(Bandwidth-1);
for i := k+1 to l do begin
iidx := LinearIndex(i, 0, mm);
tmp1 := A[iidx + 1]*tmp2;
Al[Alidx + i] := tmp1;
for j := 2 to mm do begin
A[iidx + j - 1] := A[iidx + j]-tmp1*A[kidx + j];
end;
A[iidx + mm] := 0.0;
end;
end;
end;
{$L LU.obj}
procedure CLUDecomposeBandedReal(A, Al: PDouble; Index: PInteger; N, Bandwidth: Integer); cdecl; external name 'LUDecomposeBanded';
type
TLUDecomposeBandedRealProc = procedure(A, Al: PDouble; Index: PInteger; N, Bandwidth: Integer); cdecl;
procedure Test(Proc: TLUDecomposeBandedRealProc);
const
N = 5000;
Bandwidth = 8;
IterationCount = 5000;
var
i, j, mm: Integer;
Input, A, Al: TArray;
Index: TArray;
Stopwatch: TStopwatch;
begin
mm := 1+2*Bandwidth;
SetLength(Input, (1+N)*(1+mm));
for i := 1 to N do begin
for j := 1 to mm do begin
Input[LinearIndex(i, j, mm)] := 0.5 + i - j;
end;
end;
SetLength(Al, (1+N)*(1+BandWidth));
SetLength(Index, 1+N);
SetLength(A, Length(Input));
Stopwatch := TStopwatch.StartNew;
for i := 1 to IterationCount do begin
Move(Pointer(Input)^, Pointer(A)^, Length(A)*SizeOf(A[0]));
Proc(PDouble(A), PDouble(Al), PInteger(Index), N, Bandwidth);
end;
Writeln(Stopwatch.ElapsedMilliseconds);
end;
begin
Test(PascalLUDecomposeBandedReal);
Test(CLUDecomposeBandedReal);
Readln;
end.
C, LU.c
// compile with cl /c /O2 LU.c
#include
int CLinearIndex(int i, int j, int d)
{
return i * (d + 1) + j;
}
void LUDecomposeBanded(double *A, double *Al, int *Index, int N, int Bandwidth)
{
double d, tmp1, tmp2;
int i, j, k, l, mm, iidx, kidx, Alidx;
mm = 1 + 2 * Bandwidth;
l = Bandwidth;
for (i = 1; i <= Bandwidth; i++)
{
iidx = CLinearIndex(i, 0, mm);
for (j = Bandwidth + 2 - i; j <= mm; j++)
{
A[iidx + j - l] = A[iidx + j];
}
l--;
for (j = mm - l; j <= mm; j++)
{
A[iidx + j] = 0.0;
}
}
d = 1.0;
l = Bandwidth;
for (k = 1; k <= N; k++)
{
kidx = CLinearIndex(k, 0, mm);
tmp1 = A[kidx + 1];
i = k;
if (l < N)
{
l++;
}
for (j = k+1; j <= l; j++)
{
if (fabs(A[CLinearIndex(j, 1, mm)]) > fabs(tmp1))
{
tmp1 = A[CLinearIndex(j, 1, mm)];
i = j;
}
}
Index[k] = i;
if (tmp1 == 0.0)
{
A[kidx + 1] = 1.0e-20;
}
if (i != k)
{
iidx = CLinearIndex(i, 0, mm);
d = -d;
for (j = 1; j <= mm; j++)
{
tmp2 = A[kidx + j];
A[kidx + j] = A[iidx + j];
A[iidx + j] = tmp2;
}
}
tmp2 = 1.0 / A[kidx + 1];
Alidx = k * (Bandwidth - 1);
for (i = k + 1; i <= l; i++)
{
iidx = CLinearIndex(i, 0, mm);
tmp1 = A[iidx + 1] * tmp2;
Al[Alidx + i] = tmp1;
for (j = 2; j <= mm; j++)
{
A[iidx + j - 1] = A[iidx + j] - tmp1 * A[kidx + j];
}
A[iidx + mm] = 0.0;
}
}
}
Output
9570
3581
I tested with XE7 and Seattle (results the same), and compiled the C code with cl 19.00.23026 for x64.
Having moved the Pascal code out of an instance method and into an unbound procedure that is passed pointers to data helped. But the C code is still 2.5 times faster.
Marco Cantù Are Embarcadero interested in improving performance of the desktop compilers?
Pascal, LUperf.dpr
program LUperf;
{$APPTYPE CONSOLE}
{$POINTERMATH ON}
uses
System.Diagnostics,
System.Win.Crtl;
function LinearIndex(i, j, d: Integer): Integer;
begin
Result := i * (d + 1) + j;
end;
procedure PascalLUDecomposeBandedReal(A, Al: PDouble; Index: PInteger; N, Bandwidth: Integer); cdecl;
var
d: Double;
i, j, k, l, iidx, kidx, Alidx: Integer;
mm: Integer;
tmp1, tmp2: Double;
begin
mm := 1+2*Bandwidth;
l := Bandwidth;
for i := 1 to Bandwidth do begin
iidx := LinearIndex(i, 0, mm);
for j := Bandwidth+2-i to mm do begin
A[iidx + j - l] := A[iidx + j];
end;
dec(l);
for j := mm-l to mm do begin
A[iidx + j] := 0.0;
end;
end;
d := 1.0;
l := Bandwidth;
for k := 1 to N do begin
kidx := LinearIndex(k, 0, mm);
tmp1 := A[kidx + 1];
i := k;
if l
end;
for j := k+1 to l do begin
if abs(A[LinearIndex(j, 1, mm)])>abs(tmp1) then begin
tmp1 := A[LinearIndex(j, 1, mm)];
i := j;
end;
end;
Index[k] := i;
if tmp1=0.0 then begin
A[kidx + 1] := 1.0e-20;
end;
if i<>k then begin
iidx := LinearIndex(i, 0, mm);
d := -d;
for j := 1 to mm do begin
tmp1 := A[kidx + j];
A[kidx + j] := A[iidx + j];
A[iidx + j] := tmp1
end;
end;
tmp2 := 1.0/A[kidx + 1];
Alidx := k*(Bandwidth-1);
for i := k+1 to l do begin
iidx := LinearIndex(i, 0, mm);
tmp1 := A[iidx + 1]*tmp2;
Al[Alidx + i] := tmp1;
for j := 2 to mm do begin
A[iidx + j - 1] := A[iidx + j]-tmp1*A[kidx + j];
end;
A[iidx + mm] := 0.0;
end;
end;
end;
{$L LU.obj}
procedure CLUDecomposeBandedReal(A, Al: PDouble; Index: PInteger; N, Bandwidth: Integer); cdecl; external name 'LUDecomposeBanded';
type
TLUDecomposeBandedRealProc = procedure(A, Al: PDouble; Index: PInteger; N, Bandwidth: Integer); cdecl;
procedure Test(Proc: TLUDecomposeBandedRealProc);
const
N = 5000;
Bandwidth = 8;
IterationCount = 5000;
var
i, j, mm: Integer;
Input, A, Al: TArray
Index: TArray
Stopwatch: TStopwatch;
begin
mm := 1+2*Bandwidth;
SetLength(Input, (1+N)*(1+mm));
for i := 1 to N do begin
for j := 1 to mm do begin
Input[LinearIndex(i, j, mm)] := 0.5 + i - j;
end;
end;
SetLength(Al, (1+N)*(1+BandWidth));
SetLength(Index, 1+N);
SetLength(A, Length(Input));
Stopwatch := TStopwatch.StartNew;
for i := 1 to IterationCount do begin
Move(Pointer(Input)^, Pointer(A)^, Length(A)*SizeOf(A[0]));
Proc(PDouble(A), PDouble(Al), PInteger(Index), N, Bandwidth);
end;
Writeln(Stopwatch.ElapsedMilliseconds);
end;
begin
Test(PascalLUDecomposeBandedReal);
Test(CLUDecomposeBandedReal);
Readln;
end.
C, LU.c
// compile with cl /c /O2 LU.c
#include
int CLinearIndex(int i, int j, int d)
{
return i * (d + 1) + j;
}
void LUDecomposeBanded(double *A, double *Al, int *Index, int N, int Bandwidth)
{
double d, tmp1, tmp2;
int i, j, k, l, mm, iidx, kidx, Alidx;
mm = 1 + 2 * Bandwidth;
l = Bandwidth;
for (i = 1; i <= Bandwidth; i++)
{
iidx = CLinearIndex(i, 0, mm);
for (j = Bandwidth + 2 - i; j <= mm; j++)
{
A[iidx + j - l] = A[iidx + j];
}
l--;
for (j = mm - l; j <= mm; j++)
{
A[iidx + j] = 0.0;
}
}
d = 1.0;
l = Bandwidth;
for (k = 1; k <= N; k++)
{
kidx = CLinearIndex(k, 0, mm);
tmp1 = A[kidx + 1];
i = k;
if (l < N)
{
l++;
}
for (j = k+1; j <= l; j++)
{
if (fabs(A[CLinearIndex(j, 1, mm)]) > fabs(tmp1))
{
tmp1 = A[CLinearIndex(j, 1, mm)];
i = j;
}
}
Index[k] = i;
if (tmp1 == 0.0)
{
A[kidx + 1] = 1.0e-20;
}
if (i != k)
{
iidx = CLinearIndex(i, 0, mm);
d = -d;
for (j = 1; j <= mm; j++)
{
tmp2 = A[kidx + j];
A[kidx + j] = A[iidx + j];
A[iidx + j] = tmp2;
}
}
tmp2 = 1.0 / A[kidx + 1];
Alidx = k * (Bandwidth - 1);
for (i = k + 1; i <= l; i++)
{
iidx = CLinearIndex(i, 0, mm);
tmp1 = A[iidx + 1] * tmp2;
Al[Alidx + i] = tmp1;
for (j = 2; j <= mm; j++)
{
A[iidx + j - 1] = A[iidx + j] - tmp1 * A[kidx + j];
}
A[iidx + mm] = 0.0;
}
}
}
Output
9570
3581
I tested with XE7 and Seattle (results the same), and compiled the C code with cl 19.00.23026 for x64.
Having moved the Pascal code out of an instance method and into an unbound procedure that is passed pointers to data helped. But the C code is still 2.5 times faster.
Marco Cantù Are Embarcadero interested in improving performance of the desktop compilers?
+Eli Inlining the large function is surely pointless
ReplyDeleteSome simple changes cuts the time from 11.2 seconds to 6.6 seconds on my laptop:
ReplyDeletefunction LinearIndex1(i, j, d: Integer): Integer; inline;
begin
Result := i * (d + 1) + j;
end;
procedure PascalLUDecomposeBandedReal1(A, Al: PDouble; Index: PInteger; N, Bandwidth: Integer); cdecl;
var
d: Double;
i, j, k, l, iidx, kidx, Alidx : Integer;
qix,pix: ^Double;
mm: Integer;
tmp1, tmp2: Double;
begin
mm := 1+2*Bandwidth;
l := Bandwidth;
for i := 1 to Bandwidth do begin
iidx := LinearIndex1(i, 0, mm);
for j := Bandwidth+2-i to mm do begin
A[iidx + j - l] := A[iidx + j];
end;
dec(l);
pix := @A[iidx + mm - l];
for j := mm-l to mm do begin
pix^ := 0.0;
Inc(pix);
//A[iidx + j] := 0.0;
end;
end;
d := 1.0;
l := Bandwidth;
for k := 1 to N do begin
kidx := LinearIndex1(k, 0, mm);
tmp1 := A[kidx + 1];
i := k;
if l inc(l);
end;
for j := k+1 to l do begin
if abs(A[LinearIndex1(j, 1, mm)])>abs(tmp1) then begin
tmp1 := A[LinearIndex1(j, 1, mm)];
i := j;
end;
end;
Index[k] := i;
if tmp1=0.0 then begin
A[kidx + 1] := 1.0e-20;
end;
if i<>k then begin
iidx := LinearIndex1(i, 0, mm);
d := -d;
qix := @A[kidx + 1];
pix := @A[iidx + 1];
for j := 1 to mm do begin
tmp1 := qix^; //A[qix];
qix^ := pix^; //A[qix] := A[pix];
pix^ := tmp1; //A[pix] := tmp1;
Inc(qix);
Inc(pix);
end;
end;
tmp2 := 1.0/A[kidx + 1];
Alidx := k*(Bandwidth-1);
for i := k+1 to l do begin
iidx := LinearIndex1(i, 0, mm);
tmp1 := A[iidx + 1]*tmp2;
Al[Alidx + i] := tmp1;
pix := @A[iidx + 1];
qix := @A[kidx + 2];
for j := 2 to mm do begin
pix^ := pix[1] - tmp1*qix^;
Inc(pix);
Inc(qix);
//A[iidx + j - 1] := A[iidx + j]-tmp1*A[kidx + j];
end;
A[iidx + mm] := 0.0;
end;
end;
end;
My test: Delphi XE2 64 bit - 11170, FPC trunk 64 bit - 9563
ReplyDelete