/**************************************************************************
* File : winmembw.c
* Author : Matt Buchanan
* Date : 1 October 1997
*
* Summary: Memory bandwidth test for Windows NT
**************************************************************************/
#ifndef _M_IX86
#error This program uses Pentium-specific code.
#endif
#include <windows.h>
#include <stdio.h>
#include <stdlib.h>
/* Highest debug level that DbgPrint() actually prints when DEBUG is defined. */
#define DEBUG_PRINT_MAX 1

/*
 * DbgPrint(level) (fmt, ...) -- printf-style trace output.
 *
 * Usage:  DbgPrint(1) ("Thread %u waiting\n", n);
 *
 * With DEBUG defined the message is printed when level <= DEBUG_PRINT_MAX;
 * otherwise the call is guarded by "if (0)" and never executes.
 * The macro expands to an unbraced "if", so do not use it as the body of an
 * if statement that is followed by an "else".
 */
#ifdef DEBUG
#define DbgPrint(n) if ((n) <= DEBUG_PRINT_MAX) printf
#else
#define DbgPrint(n) if (0) printf
#endif

/* Bytes per megabyte. */
#define MB 1048576
/* Extra working-set bytes requested on top of the pinned buffer size. */
#define PIN_OVERHEAD_BYTES (1 * MB)
/* Number of timed repetitions per transfer size. */
#define NUM_TRIALS 200
/* Smallest and largest transfer sizes, in bytes; sizes double in between. */
#define MIN_TRANSFER_SIZE 8
#define MAX_TRANSFER_SIZE (4 * MB)
/* Capacity of the per-CPU handle arrays. */
#define MAX_NUM_CPUS 16
/* Capacity of the per-transfer-size timing arrays. */
#define MAX_NUM_DATAPTS 25
/* Per-CPU barrier events and thread handles, indexed by CPU number. */
HANDLE      hLeaveBarrierEvent[MAX_NUM_CPUS];   /* Set by the measuring thread to release a sink thread. */
HANDLE      hEnterBarrierEvent[MAX_NUM_CPUS];   /* Set by a sink thread when it reaches a barrier. */
HANDLE      hThread[MAX_NUM_CPUS];              /* Sink thread handles from CreateThread(). */

PBYTE       pCopyBuf;           /* Locked buffer that all threads read, write and copy within. */
HANDLE      hProcess;           /* Pseudo-handle from GetCurrentProcess(). */
SYSTEM_INFO si;                 /* Filled in by GetSystemInfo(). */
DWORD       dwProcAffMask;      /* Process affinity mask from GetProcessAffinityMask(). */
DWORD       dwSysAffMask;       /* System affinity mask from GetProcessAffinityMask(). */
DWORD       cbPinSize;          /* Byte count returned by SetMaxPinSize(). */
DWORD       dwCpuFreqHz;        /* CPU clock: read in MHz, then scaled to Hz in main(). */
PCHAR       pProgName;          /* argv[0], used as a prefix for all messages. */
/**************************************************************************
* Return floor (log2 (n)): the number of times n can be halved before it
* drops to 1.  Both n == 0 and n == 1 yield 0.
*/
DWORD
log2 (DWORD n)
{
    DWORD cShifts;

    for (cShifts = 0; n > 1; n >>= 1)
        cShifts++;
    return (cShifts);
} /* End log2 () */
/**************************************************************************
* ReadPentiumCycleCount() returns the value of the Time Stamp Counter
* register on Pentium (and later) CPUs.
*
* The body contains no return statement: the rdtsc instruction leaves the
* 64-bit count in edx:eax and the code relies on the compiler taking the
* __int64 return value from that register pair.  The warnings that would
* otherwise flag the missing return are disabled around the function.
*
* Return:
* The current cycle count.
*/
#ifndef _M_IX86
#error ReadPentiumCycleCount() requires an Intel CPU.
#else
#pragma warning (disable : 4033)
/* Disable the 'function should return a value' warnings (4033, 4035) */
#pragma warning (disable : 4035)
/* for this function only; they are restored after it. */
__inline __int64
ReadPentiumCycleCount (VOID)
{
__asm
{
_emit 0fh
/* First opcode byte of the rdtsc instruction, emitted as raw data. */
_emit 31h
/* Second opcode byte; once executed, edx:eax = cycle count. */
}
}
#pragma warning (default : 4033)
/* Restore both warnings to their default state. */
#pragma warning (default : 4035)
#endif
/**************************************************************************
* EstimatePentiumClock() fetches NT's estimate of the CPU clock frequency
* in MHz (accurate to within about 1 MHz) from the "~MHz" value of the
* CentralProcessor\0 hardware key in the registry.
*
* Input:
* pEstFreq
* Location that will receive the clock frequency.
*
* Return:
* TRUE on success; FALSE if the key could not be opened or the value
* could not be read.
*/
#pragma comment (lib, "advapi32")
BOOL
EstimatePentiumClock (LPDWORD pEstFreq)
{
    HKEY  hCpuKey;
    DWORD cbValue;
    LONG  lStatus;

    lStatus = RegOpenKeyEx (HKEY_LOCAL_MACHINE,
                            "Hardware\\Description\\System\\CentralProcessor\\0",
                            0,
                            KEY_READ,
                            &hCpuKey);
    if (lStatus != ERROR_SUCCESS)
        return (FALSE);

    /* The key is open: read the value, then always release the key. */
    cbValue = sizeof (DWORD);
    lStatus = RegQueryValueEx (hCpuKey,
                               "~MHz",
                               NULL,
                               NULL,
                               (LPBYTE) pEstFreq,
                               &cbValue);
    RegCloseKey (hCpuKey);

    return (lStatus == ERROR_SUCCESS);
} /* End EstimatePentiumClock() */
/**************************************************************************
* Return the number of set bits in n.  Each pass clears the lowest set
* bit, so the loop runs once per set bit.
*/
DWORD
bitcount (DWORD n)
{
    DWORD cSet = 0;

    while (n != 0)
    {
        n &= n - 1;     /* Clear the lowest set bit. */
        cSet++;
    }
    return (cSet);
} /* End bitcount () */
/**************************************************************************
* SetMaxPinSize() attempts to make room to pin down cbMaxRequired bytes in
* physical memory by setting the process working set size to the request
* plus PIN_OVERHEAD_BYTES.  Administrator privileges are required if
* cbMaxRequired is very large.
*
* If the system reports ERROR_NO_SYSTEM_RESOURCES the request is reduced
* in 1 MB steps and retried; sizes below 1 MB are never retried.
*
* Input:
* cbMaxRequired
* Largest number of bytes the caller would like to pin.
*
* Return:
* The number of bytes for which the working set was successfully sized
* (at most cbMaxRequired), or 0 if no attempt succeeded.  A message is
* printed for errors other than resource exhaustion.
*/
DWORD
SetMaxPinSize (DWORD cbMaxRequired)
{
    HANDLE hSelf = GetCurrentProcess ();
    DWORD  cbTry = cbMaxRequired;
    DWORD  dwLastError;

    for (;;)
    {
        if (SetProcessWorkingSetSize (hSelf,
                                      cbTry + PIN_OVERHEAD_BYTES,
                                      cbTry + PIN_OVERHEAD_BYTES))
            return (cbTry);     /* Only a size that succeeded is returned. */

        dwLastError = GetLastError ();
        switch (dwLastError)
        {
        case ERROR_NO_SYSTEM_RESOURCES:
            /* Stop once the next step would fall below 1 MB; this   */
            /* also keeps the unsigned subtraction from wrapping.    */
            if (cbTry < 2 * MB)
                return (0);
            cbTry -= 1 * MB;
            break;

        case ERROR_PRIVILEGE_NOT_HELD:
            printf ("%s: Insufficient privileges for SetMaxPinSize().\n",
                    pProgName);
            return (0);

        default:
            printf ("%s: Unexpected error (%lu) from "
                    "SetProcessWorkingSetSize().\n", pProgName, dwLastError);
            return (0);
        }
    }
} /* End SetMaxPinSize () */
/**************************************************************************
* Chew up memory bandwidth almost identically to MeasureBandwidhth(),
but
* without reporting the results.
*/
DWORD WINAPI
BandwidthSinkThread (LPVOID dwThreadNum)
{
PBYTE
pBuf1, pBuf2;
DWORD
j;
__int64
tElapsed[MAX_NUM_DATAPTS];
__int64
t1;
DWORD
cbCurXfer;
DWORD
dwCurDataPtNum;
pBuf1 = (2 * MAX_TRANSFER_SIZE * (DWORD) dwThreadNum) + pCopyBuf;
pBuf2 = MAX_TRANSFER_SIZE + pBuf1;
DbgPrint(1) ("Thread %u setting event\n", (DWORD) dwThreadNum);
SetEvent (hEnterBarrierEvent[(DWORD) dwThreadNum]);
DbgPrint(1) ("Thread %u waiting\n", (DWORD) dwThreadNum);
WaitForSingleObject (hLeaveBarrierEvent[(DWORD) dwThreadNum],
INFINITE);
/* Burn up EAX->RAM bandwidth. */
cbCurXfer = MIN_TRANSFER_SIZE;
dwCurDataPtNum = 0;
while (cbCurXfer <= MAX_TRANSFER_SIZE)
{
tElapsed[dwCurDataPtNum] = 0;
for (j = 0; j < NUM_TRIALS; j++)
{
t1 = ReadPentiumCycleCount ();
__asm
{
mov
ecx, cbCurXfer
shr
ecx, 2
cld
mov
edi, pBuf1
rep
stosd
}
tElapsed[dwCurDataPtNum] += ReadPentiumCycleCount
() - t1;
}
dwCurDataPtNum++;
cbCurXfer *= 2;
}
DbgPrint(1) ("Thread %u setting event\n", (DWORD) dwThreadNum);
SetEvent (hEnterBarrierEvent[(DWORD) dwThreadNum]);
DbgPrint(1) ("Thread %u waiting\n", (DWORD) dwThreadNum);
WaitForSingleObject (hLeaveBarrierEvent[(DWORD) dwThreadNum],
INFINITE);
/* Burn up RAM->EAX bandwidth. */
cbCurXfer = MIN_TRANSFER_SIZE;
dwCurDataPtNum = 0;
while (cbCurXfer <= MAX_TRANSFER_SIZE)
{
tElapsed[dwCurDataPtNum] = 0;
for (j = 0; j < NUM_TRIALS; j++)
{
t1 = ReadPentiumCycleCount ();
__asm
{
mov
ecx, cbCurXfer
shr
ecx, 2
cld
mov
esi, pBuf2
rep
lodsd
}
tElapsed[dwCurDataPtNum] += ReadPentiumCycleCount
() - t1;
}
dwCurDataPtNum++;
cbCurXfer *= 2;
}
DbgPrint(1) ("Thread %u setting event\n", (DWORD) dwThreadNum);
SetEvent (hEnterBarrierEvent[(DWORD) dwThreadNum]);
DbgPrint(1) ("Thread %u waiting\n", (DWORD) dwThreadNum);
WaitForSingleObject (hLeaveBarrierEvent[(DWORD) dwThreadNum],
INFINITE);
/* Burn up RAM->RAM bandwidth. */
cbCurXfer = MIN_TRANSFER_SIZE;
dwCurDataPtNum = 0;
while (cbCurXfer <= MAX_TRANSFER_SIZE)
{
tElapsed[dwCurDataPtNum] = 0;
for (j = 0; j < NUM_TRIALS; j++)
{
t1 = ReadPentiumCycleCount ();
__asm
{
mov
ecx, cbCurXfer
shr
ecx, 2
cld
mov
esi, pBuf1
mov
edi, pBuf2
rep
movsd
}
tElapsed[dwCurDataPtNum] += ReadPentiumCycleCount
() - t1;
}
dwCurDataPtNum++;
cbCurXfer *= 2;
}
DbgPrint(1) ("Thread %u setting event\n", (DWORD) dwThreadNum);
SetEvent (hEnterBarrierEvent[(DWORD) dwThreadNum]);
DbgPrint(1) ("Thread %u waiting\n", (DWORD) dwThreadNum);
WaitForSingleObject (hLeaveBarrierEvent[(DWORD) dwThreadNum],
INFINITE);
return (0);
} /* End BandwidthSinkThread () */
/**************************************************************************
* Measure memory system bandwidth. If si.dwNumberOfProcessors
> 1, assume
* that the other processors in the system are each running
an instance of
* BandwidthSinkThread().
*/
VOID
MeasureBandwidth (VOID)
{
DWORD
dwOldPriorityClass;
PBYTE
pBuf1, pBuf2;
DWORD
j;
__int64
tElapsedEaxIn[MAX_NUM_DATAPTS];
__int64
tElapsedEaxOut[MAX_NUM_DATAPTS];
__int64
tElapsedRamRam[MAX_NUM_DATAPTS];
__int64
t1;
DWORD
cbCurXfer;
DWORD
dwCurDataPtNum;
DWORD
cDataPtsReq;
pBuf1 = pCopyBuf;
pBuf2 = MAX_TRANSFER_SIZE + pBuf1;
dwOldPriorityClass = GetPriorityClass (hProcess);
printf ("%s: Switching to real-time priority.\n", pProgName);
SetPriorityClass (hProcess, REALTIME_PRIORITY_CLASS);
/* Release the sink threads for the write bandwidth test. */
if (si.dwNumberOfProcessors > 1)
{
WaitForMultipleObjects
(
si.dwNumberOfProcessors - 1,
1 + hEnterBarrierEvent,
TRUE,
INFINITE
);
for (j = 1; j < si.dwNumberOfProcessors; j++)
SetEvent (hLeaveBarrierEvent[j]);
}
cbCurXfer = MIN_TRANSFER_SIZE;
dwCurDataPtNum = 0;
while (cbCurXfer <= MAX_TRANSFER_SIZE)
{
tElapsedEaxOut[dwCurDataPtNum] = 0;
for (j = 0; j < NUM_TRIALS; j++)
{
t1 = ReadPentiumCycleCount ();
__asm
{
mov
ecx, cbCurXfer
shr
ecx, 2
cld
mov
edi, pBuf1
rep
stosd
}
tElapsedEaxOut[dwCurDataPtNum] +=
ReadPentiumCycleCount () - t1;
}
dwCurDataPtNum++;
cbCurXfer *= 2;
}
/* Release the sink threads for the read bandwidth test. */
if (si.dwNumberOfProcessors > 1)
{
WaitForMultipleObjects
(
si.dwNumberOfProcessors - 1,
1 + hEnterBarrierEvent,
TRUE,
INFINITE
);
for (j = 1; j < si.dwNumberOfProcessors; j++)
SetEvent (hLeaveBarrierEvent[j]);
}
cbCurXfer = MIN_TRANSFER_SIZE;
dwCurDataPtNum = 0;
while (cbCurXfer <= MAX_TRANSFER_SIZE)
{
DbgPrint(1) ("%s: Waiting\n", pProgName);
tElapsedEaxIn[dwCurDataPtNum] = 0;
for (j = 0; j < NUM_TRIALS; j++)
{
t1 = ReadPentiumCycleCount ();
__asm
{
mov
ecx, cbCurXfer
shr
ecx, 2
cld
mov
esi, pBuf2
rep
lodsd
}
tElapsedEaxIn[dwCurDataPtNum] +=
ReadPentiumCycleCount () - t1;
}
dwCurDataPtNum++;
cbCurXfer *= 2;
}
/* Release the sink threads for the read+write bandwidth test. */
if (si.dwNumberOfProcessors > 1)
{
WaitForMultipleObjects
(
si.dwNumberOfProcessors - 1,
1 + hEnterBarrierEvent,
TRUE,
INFINITE
);
for (j = 1; j < si.dwNumberOfProcessors; j++)
SetEvent (hLeaveBarrierEvent[j]);
}
cbCurXfer = MIN_TRANSFER_SIZE;
dwCurDataPtNum = 0;
while (cbCurXfer <= MAX_TRANSFER_SIZE)
{
tElapsedRamRam[dwCurDataPtNum] = 0;
for (j = 0; j < NUM_TRIALS; j++)
{
t1 = ReadPentiumCycleCount ();
__asm
{
mov
ecx, cbCurXfer
shr
ecx, 2
cld
mov
esi, pBuf1
mov
edi, pBuf2
rep
movsd
}
tElapsedRamRam[dwCurDataPtNum] +=
ReadPentiumCycleCount () - t1;
}
dwCurDataPtNum++;
cbCurXfer *= 2;
}
/* Let the sink threads terminate. */
if (si.dwNumberOfProcessors > 1)
{
WaitForMultipleObjects
(
si.dwNumberOfProcessors - 1,
1 + hEnterBarrierEvent,
TRUE,
INFINITE
);
for (j = 1; j < si.dwNumberOfProcessors; j++)
SetEvent (hLeaveBarrierEvent[j]);
}
SetPriorityClass (hProcess, dwOldPriorityClass);
printf ("%s: Back to normal priority.\n\n", pProgName);
printf ("Xfer size (B)\tEAX->RAM\tRAM->EAX\tRAM->RAM\n");
cDataPtsReq = 1 + log2 (MAX_TRANSFER_SIZE) - log2 (MIN_TRANSFER_SIZE);
cbCurXfer = MIN_TRANSFER_SIZE;
for (j = 0; j < cDataPtsReq; j++, cbCurXfer *= 2)
printf ("%13lu\t%8.2f\t%8.2f\t%8.2f\n",
cbCurXfer,
si.dwNumberOfProcessors *
((double) NUM_TRIALS * cbCurXfer * dwCpuFreqHz) /
(tElapsedEaxOut[j] * (1 * MB)),
si.dwNumberOfProcessors *
((double) NUM_TRIALS * cbCurXfer * dwCpuFreqHz) /
(tElapsedEaxIn[j] * (1 * MB)),
si.dwNumberOfProcessors *
2 * ((double) NUM_TRIALS * cbCurXfer * dwCpuFreqHz) /
(tElapsedRamRam[j] * (1 * MB)));
} /* End MeasureBandwidth () */
/**************************************************************************
* main()
*/
int main (int argc, char *argv[])
{
DWORD
j;
DWORD
dwThreadId;
DWORD
cDataPtsReq;
pProgName = *argv;
if (!EstimatePentiumClock (&dwCpuFreqHz))
{
printf ("%s: EstimatePentiumClock() failed\n",
pProgName);
exit (1);
}
printf ("%s: CPU clock frequency == %u MHz\n", pProgName,
dwCpuFreqHz);
dwCpuFreqHz *= 1000000;
GetSystemInfo (&si);
printf ("%s: Number of processors == %u\n", pProgName,
si.dwNumberOfProcessors);
if (si.dwNumberOfProcessors > MAX_NUM_CPUS)
{
printf ("%s: Number of processors in system
exceeds compile-time "
"constant.\n", pProgName);
exit (1);
}
/* Determine the space we need to record each memory characteristic. */
cDataPtsReq = 1 + log2 (MAX_TRANSFER_SIZE) - log2 (MIN_TRANSFER_SIZE);
if (cDataPtsReq > MAX_NUM_DATAPTS - 1)
{
printf ("%s: Recompile with MAX_NUM_DATAPTS
>= %u\n", pProgName,
cDataPtsReq);
exit (1);
}
/* Make sure we can use all of the CPUs in the system. */
hProcess = GetCurrentProcess ();
if (!GetProcessAffinityMask (hProcess, &dwProcAffMask,
&dwSysAffMask))
{
printf ("%s: GetProcessAffinityMask()
failed.\n", pProgName);
exit (1);
}
if (bitcount (dwProcAffMask) < si.dwNumberOfProcessors)
{
printf ("%s: Not all of the system's CPUs
are available.\n");
exit (1);
}
/* Pin down enough space so that all CPUs can copy
MAX_TRANSFER_SIZE */
/* bytes from one buffer to another simultaneously
without paging. */
cbPinSize =
SetMaxPinSize (2 * si.dwNumberOfProcessors *
MAX_TRANSFER_SIZE);
if (!cbPinSize)
{
printf ("%s: SetMaxPinSize() failed.\n",
pProgName);
exit (1);
}
pCopyBuf = VirtualAlloc /*
pCopyBuf is aligned on 64 k boundary. */
(
NULL,
cbPinSize,
MEM_COMMIT | MEM_RESERVE,
PAGE_READWRITE
);
if (!pCopyBuf)
{
printf ("%s: VirtualAlloc() failed.\n",
pProgName);
exit (1);
}
if (!VirtualLock (pCopyBuf, cbPinSize))
{
printf ("%s: VirtualLock() failed (%d).\n",
pProgName,
GetLastError ());
exit (1);
}
printf ("%s: Pinned down %u bytes.\n", pProgName, cbPinSize);
/* On multiprocessor systems, run a sink thread on
n - 1 CPUs while */
/* the main thread runs on CPU 0, measuring aggregate
bandwidth. */
for (j = 1; j < si.dwNumberOfProcessors; j++)
{
hLeaveBarrierEvent[j] = CreateEvent (NULL, FALSE,
FALSE, NULL);
if (!hLeaveBarrierEvent[j])
{
printf ("%s: CreateEvent()
failed (%u).\n", pProgName,
GetLastError ());
exit (1);
}
hEnterBarrierEvent[j] = CreateEvent (NULL, FALSE, FALSE, NULL);
if (!hEnterBarrierEvent[j])
{
printf ("%s: CreateEvent()
failed (%u).\n", pProgName,
GetLastError ());
exit (1);
}
hThread[j] = CreateThread
(
NULL,
0,
/* Default stack size. */
BandwidthSinkThread,
(LPVOID) j,
0,
&dwThreadId
);
if (!hThread[j])
{
printf ("%s: CreateThread()
failed (%u).\n", pProgName,
GetLastError ());
exit (1);
}
if (!SetThreadAffinityMask (hThread[j], 1 <<
j))
{
printf ("%s: SetThreadAffinityMask()
failed (%u).\n", pProgName,
GetLastError ());
exit (1);
}
}
/* Measure aggregate memory system bandwidth. */
SetThreadAffinityMask (GetCurrentThread (), 1);
printf ("%s: \"RAM->RAM\" indicates aggregate read+write
bandwidth.\n",
pProgName);
if (si.dwNumberOfProcessors > 1)
{
printf ("%s: Measuring memory bandwidth
with %u threads.\n",
pProgName, si.dwNumberOfProcessors);
MeasureBandwidth ();
}
/* Measure memory bandwidth with only one CPU beating on it. */
printf ("\n%s: Measuring memory bandwidth with 1 thread.\n", pProgName);
si.dwNumberOfProcessors = 1;
MeasureBandwidth ();
return (0);
}