/**************************************************************************
*
*  Memory Bandwidth test for Windows NT, version 1.0
*  Copyright 1998 The Board of Trustees of the University of Illinois
*  All rights reserved.
*
*  Contributors:  Andrew Chien and M. B. Buchanan, Department of Computer
*  Science, University of Illinois at Urbana-Champaign
*
*  Redistribution and use in source and binary forms, with or without
*  modification, are permitted provided that the following conditions are
*  met:
*
*  1.  Redistributions of source code must retain the above copyright
*  notice, this list of conditions and the following disclaimer.
*
*  2.  Redistributions in binary form must reproduce the above copyright
*  notice, this list of conditions and the following disclaimer in the
*  documentation and/or materials provided with the distribution.
*
*  3.  In addition, redistributions of modified forms of the source or
*  binary code must carry prominent notices stating that the original code
*  was changed and the date of the change.
*
*  4.  All publications or advertising materials mentioning features or use
*  of this software must acknowledge that it was developed by the
*  University of Illinois at Urbana-Champaign and credit the contributors.
*
*  5.  Neither the name of the University nor the names of the contributors
*  may be used to endorse or promote products derived from this software
*  without specific prior written permission from the University.
*
*  6.  THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY AND THE CONTRIBUTORS "AS
*  IS" WITH NO WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED.  In no
*  event shall the University or the contributors be liable for any damages
*  suffered by the users arising out of the use of this software, even if
*  advised of the possibility of such damage.
*
**************************************************************************/

/**************************************************************************
*  File   :  winmembw.c
*  Author :  Matt Buchanan
*  Date   :  1 October 1997
*
*  Summary:  Memory bandwidth test for Windows NT
**************************************************************************/

#ifndef _M_IX86
#error This program uses Pentium-specific code.
#endif

#include <windows.h>
#include <stdio.h>
#include <stdlib.h>

/*  Highest debug level that DbgPrint() will actually print.  */

#define DEBUG_PRINT_MAX       1

/*
*  Usage:  DbgPrint(level) ("format", args...);
*
*  With DEBUG defined the call forwards to printf when
*  level <= DEBUG_PRINT_MAX.  Without DEBUG the printf call is still
*  compiled (so its arguments stay type-checked) but is never executed.
*
*  The expansion has the shape "if (...) {} else printf" so that the macro's
*  own `if` already owns an `else`.  An `else` written by the caller after a
*  DbgPrint statement therefore pairs with the caller's `if` rather than
*  being captured by the macro.
*/

#ifdef DEBUG
#define DbgPrint(n)     if ((n) > DEBUG_PRINT_MAX) {} else printf
#else
#define DbgPrint(n)     if (1) {} else printf
#endif

/*  Bytes in one megabyte (2^20).  */
#define MB                    1048576

/*  Extra working-set bytes requested on top of the pin size in  */
/*  SetMaxPinSize().                                              */
#define PIN_OVERHEAD_BYTES    (1 * MB)

/*  Each transfer size is timed NUM_TRIALS times.  Transfer sizes start at  */
/*  MIN_TRANSFER_SIZE bytes and double until they exceed                    */
/*  MAX_TRANSFER_SIZE bytes.                                                */
#define NUM_TRIALS            200
#define MIN_TRANSFER_SIZE     8
#define MAX_TRANSFER_SIZE     (4 * MB)

/*  Capacity of the per-thread handle arrays and of the per-transfer-size  */
/*  elapsed-time arrays; main() checks both limits before running.         */
#define MAX_NUM_CPUS          16
#define MAX_NUM_DATAPTS       25

/*  Per-sink-thread barrier events and thread handles, indexed by thread   */
/*  number.  main() only fills in entries 1 .. n-1; entry 0 would belong   */
/*  to the main thread and is not used.  A sink thread sets its "enter"    */
/*  event on arrival and waits on its "leave" event, which                 */
/*  MeasureBandwidth() sets.                                               */
HANDLE                        hLeaveBarrierEvent[MAX_NUM_CPUS];
HANDLE                        hEnterBarrierEvent[MAX_NUM_CPUS];
HANDLE                        hThread[MAX_NUM_CPUS];

/*  Buffer allocated and locked by main().  Thread k works in the          */
/*  2 * MAX_TRANSFER_SIZE bytes starting at offset                         */
/*  2 * MAX_TRANSFER_SIZE * k (the main thread is k == 0).                 */
PBYTE                         pCopyBuf;

/*  All of the following are set by main().  */

HANDLE                        hProcess;       /*  GetCurrentProcess ()      */
/*  Filled by GetSystemInfo ().  main() overwrites si.dwNumberOfProcessors  */
/*  with 1 before the final single-thread measurement.                      */
SYSTEM_INFO                   si;
DWORD                         dwProcAffMask;  /*  GetProcessAffinityMask () */
DWORD                         dwSysAffMask;   /*  GetProcessAffinityMask () */
DWORD                         cbPinSize;      /*  SetMaxPinSize () result;  */
                                              /*  also size of pCopyBuf.    */
/*  Receives MHz from EstimatePentiumClock(), then is multiplied by  */
/*  1000000 in main() so that it holds Hz afterwards.                */
DWORD                         dwCpuFreqHz;
PCHAR                         pProgName;      /*  argv[0], message prefix   */
 

/**************************************************************************
*  Return floor (log2 (n)).
*/

DWORD
log2 (DWORD n)
{
  DWORD                       cHalvings;

  /*  Halve n until it can no longer be halved to a non-zero value; the  */
  /*  number of halvings is floor (log2 (n)).  Both n == 0 and n == 1    */
  /*  yield 0.                                                           */

  for (cHalvings = 0; n > 1; n >>= 1)
    cHalvings++;

  return (cHalvings);
}  /*  End log2 ()  */
 

/**************************************************************************
*  ReadPentiumCycleCount() function returns the value of the Time Stamp
*  Counter register on Pentium (and later) CPUs.
*/

#ifndef _M_IX86
#error ReadPentiumCycleCount() requires an Intel CPU.
#else
#pragma warning (disable : 4033)       /*  Disable the 'function should  */
#pragma warning (disable : 4035)       /*  return a value' warning.      */
__inline __int64
ReadPentiumCycleCount (VOID)
{
  /*  There is deliberately no return statement.  The rdtsc instruction   */
  /*  leaves the 64-bit counter in edx:eax and that register pair is      */
  /*  what the caller receives as the __int64 result; the surrounding     */
  /*  #pragma lines silence the "should return a value" warnings that     */
  /*  this provokes.  The instruction is emitted as its raw opcode bytes  */
  /*  (0F 31) rather than by mnemonic.                                    */

  __asm
  {
    _emit   0fh                          /*  rdtsc instruction      */
    _emit   31h                          /*  edx:eax = cycle count  */
  }
}
#pragma warning (default : 4033)
#pragma warning (default : 4035)
#endif
 

/**************************************************************************
*  EstimatePentiumClock() returns NT's estimate of the CPU clock frequency
*  in MHz, accurate to within about 1 MHz.
*
*  Input:
*    pEstFreq
*      Location that will receive the clock frequency calculation.
*
*  Return:
*    TRUE on success
*/

#pragma comment (lib, "advapi32")

/*
*  Reads the "~MHz" value of processor 0 from the registry.
*
*  The value is read into a local first and only copied to *pEstFreq once
*  the query has succeeded AND the value has been confirmed to be a
*  REG_DWORD of exactly sizeof (DWORD) bytes, so the caller's variable is
*  never left holding a partial or mistyped value.
*
*  Returns TRUE on success; FALSE if pEstFreq is NULL, the key or value
*  cannot be read, or the value has an unexpected type or size.
*/

BOOL
EstimatePentiumClock (LPDWORD pEstFreq)
{
  HKEY                        hKey;
  DWORD                       dwType;
  DWORD                       dwMHz;
  DWORD                       cbBuffer;
  LONG                        rc;

  if (!pEstFreq)
    return (FALSE);

  rc = RegOpenKeyEx
       (
         HKEY_LOCAL_MACHINE,
         "Hardware\\Description\\System\\CentralProcessor\\0",
         0,
         KEY_READ,
         &hKey
       );

  if (rc != ERROR_SUCCESS)
    return (FALSE);

  dwType = REG_NONE;
  dwMHz = 0;
  cbBuffer = sizeof (dwMHz);

  rc = RegQueryValueEx
       (
         hKey,
         "~MHz",
         NULL,
         &dwType,
         (LPBYTE) &dwMHz,
         &cbBuffer
       );

  RegCloseKey (hKey);

  /*  A successful query is not enough:  reject anything that is not a  */
  /*  full-width DWORD.                                                 */

  if (rc != ERROR_SUCCESS || dwType != REG_DWORD ||
      cbBuffer != sizeof (dwMHz))
    return (FALSE);

  *pEstFreq = dwMHz;

  return (TRUE);
}  /*  End EstimatePentiumClock()  */
 

/**************************************************************************
*  Return the number of set bits in n.
*/

DWORD
bitcount (DWORD n)
{
  DWORD                       cBitsSet = 0;

  /*  n & (n - 1) clears the lowest set bit of n, so the loop body runs  */
  /*  exactly once per set bit.                                          */

  while (n != 0)
  {
    n &= n - 1;
    cBitsSet++;
  }

  return (cBitsSet);
}  /*  End bitcount ()  */
 

/**************************************************************************
*  SetMaxPinSize() attempts to pin down cbMaxRequired bytes in physical
*  memory.  Administrator privileges are required if cbMaxRequired is
*  very large.  Return value is the number of bytes actually pinned.
*/

/*
*  Asks for a working set of (size + PIN_OVERHEAD_BYTES) bytes, starting
*  with size == cbMaxRequired and backing off 1 MB at a time for as long as
*  the system answers ERROR_NO_SYSTEM_RESOURCES.
*
*  Returns the size for which SetProcessWorkingSetSize() succeeded.
*  Returns 0 if no attempt succeeded:  either an unexpected error occurred
*  (a message is printed) or the size could not be lowered by another 1 MB
*  step without dropping below 1 MB.  Sizes below 1 MB are only ever tried
*  when cbMaxRequired itself is below 1 MB.
*/

DWORD
SetMaxPinSize (DWORD cbMaxRequired)
{
  DWORD                       dwTrySize;
  HANDLE                      hCurProcess;
  DWORD                       dwLastError;

  hCurProcess = GetCurrentProcess ();

  /*  Keep dwTrySize + PIN_OVERHEAD_BYTES from wrapping around.  */

  dwTrySize = cbMaxRequired;
  if (dwTrySize > MAXDWORD - PIN_OVERHEAD_BYTES)
    dwTrySize = MAXDWORD - PIN_OVERHEAD_BYTES;

  for (;;)
  {
    if (SetProcessWorkingSetSize
        (
          hCurProcess,
          dwTrySize + PIN_OVERHEAD_BYTES,
          dwTrySize + PIN_OVERHEAD_BYTES
        ))
      return (dwTrySize);

    dwLastError = GetLastError ();

    switch (dwLastError)
    {
      case ERROR_NO_SYSTEM_RESOURCES:

        /*  Give up instead of reporting a size that was never granted,  */
        /*  and never subtract past zero.                                */

        if (dwTrySize < 2 * MB)
          return (0);

        dwTrySize -= 1 * MB;
        break;

      case ERROR_PRIVILEGE_NOT_HELD:
        printf ("%s:  Insufficient privileges for SetMaxPinSize().\n",
                pProgName);
        return (0);

      default:
        printf ("%s:  Unexpected error (%u) from "
                "SetProcessWorkingSetSize().\n", pProgName, dwLastError);
        return (0);
    }
  }
}  /*  End SetMaxPinSize ()  */
 

/**************************************************************************
*  Chew up memory bandwidth almost identically to MeasureBandwidth(), but
*  without reporting the results.
*/

DWORD WINAPI
BandwidthSinkThread (LPVOID dwThreadNum)
{
  /*  dwThreadNum is not a pointer:  it is this thread's index, passed    */
  /*  through the thread parameter and cast back to DWORD wherever it is  */
  /*  used.  It selects this thread's slice of pCopyBuf and its pair of   */
  /*  barrier events.  The thread always returns 0.                       */

  PBYTE                       pBuf1, pBuf2;
  DWORD                       j;
  __int64                     tElapsed[MAX_NUM_DATAPTS];
  __int64                     t1;
  DWORD                       cbCurXfer;
  DWORD                       dwCurDataPtNum;

  /*  Each thread owns 2 * MAX_TRANSFER_SIZE bytes of pCopyBuf:  pBuf1 is  */
  /*  the first half of that slice, pBuf2 the second half.                 */

  pBuf1 = (2 * MAX_TRANSFER_SIZE * (DWORD) dwThreadNum) + pCopyBuf;
  pBuf2 = MAX_TRANSFER_SIZE + pBuf1;

  /*  Barrier:  announce arrival on the "enter" event, then block until  */
  /*  the "leave" event is set.  The same sequence precedes each phase   */
  /*  and follows the last one.                                          */

  DbgPrint(1) ("Thread %u setting event\n", (DWORD) dwThreadNum);
  SetEvent (hEnterBarrierEvent[(DWORD) dwThreadNum]);
  DbgPrint(1) ("Thread %u waiting\n", (DWORD) dwThreadNum);
  WaitForSingleObject (hLeaveBarrierEvent[(DWORD) dwThreadNum], INFINITE);

  /*  Burn up EAX->RAM bandwidth.  */

  /*  tElapsed[] is accumulated in every phase but never read in this  */
  /*  function.                                                        */

  cbCurXfer = MIN_TRANSFER_SIZE;
  dwCurDataPtNum = 0;
  while (cbCurXfer <= MAX_TRANSFER_SIZE)
  {
    tElapsed[dwCurDataPtNum] = 0;

    for (j = 0; j < NUM_TRIALS; j++)
    {
      t1 = ReadPentiumCycleCount ();

      /*  Store cbCurXfer / 4 dwords to pBuf1.  EAX is not loaded here,  */
      /*  so the value written is whatever EAX happens to hold.          */

      __asm
      {
        mov    ecx, cbCurXfer
        shr    ecx, 2
        cld
        mov    edi, pBuf1
        rep    stosd
      }

      tElapsed[dwCurDataPtNum] += ReadPentiumCycleCount () - t1;
    }

    dwCurDataPtNum++;
    cbCurXfer *= 2;
  }

  DbgPrint(1) ("Thread %u setting event\n", (DWORD) dwThreadNum);
  SetEvent (hEnterBarrierEvent[(DWORD) dwThreadNum]);
  DbgPrint(1) ("Thread %u waiting\n", (DWORD) dwThreadNum);
  WaitForSingleObject (hLeaveBarrierEvent[(DWORD) dwThreadNum], INFINITE);

  /*  Burn up RAM->EAX bandwidth.  */

  cbCurXfer = MIN_TRANSFER_SIZE;
  dwCurDataPtNum = 0;
  while (cbCurXfer <= MAX_TRANSFER_SIZE)
  {
    tElapsed[dwCurDataPtNum] = 0;

    for (j = 0; j < NUM_TRIALS; j++)
    {
      t1 = ReadPentiumCycleCount ();

      /*  Load cbCurXfer / 4 dwords from pBuf2 into EAX, one after  */
      /*  another; the loaded values are discarded.                 */

      __asm
      {
        mov    ecx, cbCurXfer
        shr    ecx, 2
        cld
        mov    esi, pBuf2
        rep    lodsd
      }

      tElapsed[dwCurDataPtNum] += ReadPentiumCycleCount () - t1;
    }

    dwCurDataPtNum++;
    cbCurXfer *= 2;
  }

  DbgPrint(1) ("Thread %u setting event\n", (DWORD) dwThreadNum);
  SetEvent (hEnterBarrierEvent[(DWORD) dwThreadNum]);
  DbgPrint(1) ("Thread %u waiting\n", (DWORD) dwThreadNum);
  WaitForSingleObject (hLeaveBarrierEvent[(DWORD) dwThreadNum], INFINITE);

  /*  Burn up RAM->RAM bandwidth.  */

  cbCurXfer = MIN_TRANSFER_SIZE;
  dwCurDataPtNum = 0;
  while (cbCurXfer <= MAX_TRANSFER_SIZE)
  {
    tElapsed[dwCurDataPtNum] = 0;

    for (j = 0; j < NUM_TRIALS; j++)
    {
      t1 = ReadPentiumCycleCount ();

      /*  Copy cbCurXfer / 4 dwords from pBuf1 to pBuf2.  */

      __asm
      {
        mov    ecx, cbCurXfer
        shr    ecx, 2
        cld
        mov    esi, pBuf1
        mov    edi, pBuf2
        rep    movsd
      }

      tElapsed[dwCurDataPtNum] += ReadPentiumCycleCount () - t1;
    }

    dwCurDataPtNum++;
    cbCurXfer *= 2;
  }

  /*  Final barrier before the thread exits.  */

  DbgPrint(1) ("Thread %u setting event\n", (DWORD) dwThreadNum);
  SetEvent (hEnterBarrierEvent[(DWORD) dwThreadNum]);
  DbgPrint(1) ("Thread %u waiting\n", (DWORD) dwThreadNum);
  WaitForSingleObject (hLeaveBarrierEvent[(DWORD) dwThreadNum], INFINITE);

  return (0);
}  /*  End BandwidthSinkThread ()  */
 

/**************************************************************************
*  Measure memory system bandwidth.  If si.dwNumberOfProcessors > 1, assume
*  that the other processors in the system are each running an instance of
*  BandwidthSinkThread().
*/

VOID
MeasureBandwidth (VOID)
{
  /*  Runs three timed phases (store, load, copy) in the first             */
  /*  2 * MAX_TRANSFER_SIZE bytes of pCopyBuf while the process is in the  */
  /*  real-time priority class, then prints one line of results per        */
  /*  transfer size.  Reads the globals hProcess, si, pCopyBuf,            */
  /*  dwCpuFreqHz and pProgName.                                           */

  DWORD                       dwOldPriorityClass;
  PBYTE                       pBuf1, pBuf2;
  DWORD                       j;
  __int64                     tElapsedEaxIn[MAX_NUM_DATAPTS];
  __int64                     tElapsedEaxOut[MAX_NUM_DATAPTS];
  __int64                     tElapsedRamRam[MAX_NUM_DATAPTS];
  __int64                     t1;
  DWORD                       cbCurXfer;
  DWORD                       dwCurDataPtNum;
  DWORD                       cDataPtsReq;

  pBuf1 = pCopyBuf;
  pBuf2 = MAX_TRANSFER_SIZE + pBuf1;

  dwOldPriorityClass = GetPriorityClass (hProcess);

  printf ("%s:  Switching to real-time priority.\n", pProgName);
  SetPriorityClass (hProcess, REALTIME_PRIORITY_CLASS);

  /*  Release the sink threads for the write bandwidth test.  */

  /*  Barrier, main-thread side:  wait until every sink thread (indices  */
  /*  1 .. n-1) has set its "enter" event, then set each "leave" event.  */
  /*  The block is skipped when si.dwNumberOfProcessors is 1.            */

  if (si.dwNumberOfProcessors > 1)
  {
    WaitForMultipleObjects
    (
      si.dwNumberOfProcessors - 1,
      1 + hEnterBarrierEvent,
      TRUE,
      INFINITE
    );

    for (j = 1; j < si.dwNumberOfProcessors; j++)
      SetEvent (hLeaveBarrierEvent[j]);
  }

  /*  Phase 1 (EAX->RAM):  time NUM_TRIALS stores of cbCurXfer bytes to  */
  /*  pBuf1 for each transfer size, accumulating elapsed cycles.         */

  cbCurXfer = MIN_TRANSFER_SIZE;
  dwCurDataPtNum = 0;
  while (cbCurXfer <= MAX_TRANSFER_SIZE)
  {
    tElapsedEaxOut[dwCurDataPtNum] = 0;

    for (j = 0; j < NUM_TRIALS; j++)
    {
      t1 = ReadPentiumCycleCount ();

      __asm
      {
        mov    ecx, cbCurXfer
        shr    ecx, 2
        cld
        mov    edi, pBuf1
        rep    stosd
      }

      tElapsedEaxOut[dwCurDataPtNum] += ReadPentiumCycleCount () - t1;
    }

    dwCurDataPtNum++;
    cbCurXfer *= 2;
  }

  /*  Release the sink threads for the read bandwidth test.  */

  if (si.dwNumberOfProcessors > 1)
  {
    WaitForMultipleObjects
    (
      si.dwNumberOfProcessors - 1,
      1 + hEnterBarrierEvent,
      TRUE,
      INFINITE
    );

    for (j = 1; j < si.dwNumberOfProcessors; j++)
      SetEvent (hLeaveBarrierEvent[j]);
  }

  /*  Phase 2 (RAM->EAX):  time NUM_TRIALS loads of cbCurXfer bytes  */
  /*  from pBuf2 for each transfer size.                             */

  cbCurXfer = MIN_TRANSFER_SIZE;
  dwCurDataPtNum = 0;
  while (cbCurXfer <= MAX_TRANSFER_SIZE)
  {
    DbgPrint(1) ("%s:  Waiting\n", pProgName);

    tElapsedEaxIn[dwCurDataPtNum] = 0;

    for (j = 0; j < NUM_TRIALS; j++)
    {
      t1 = ReadPentiumCycleCount ();

      __asm
      {
        mov    ecx, cbCurXfer
        shr    ecx, 2
        cld
        mov    esi, pBuf2
        rep    lodsd
      }

      tElapsedEaxIn[dwCurDataPtNum] += ReadPentiumCycleCount () - t1;
    }

    dwCurDataPtNum++;
    cbCurXfer *= 2;
  }

  /*  Release the sink threads for the read+write bandwidth test.  */

  if (si.dwNumberOfProcessors > 1)
  {
    WaitForMultipleObjects
    (
      si.dwNumberOfProcessors - 1,
      1 + hEnterBarrierEvent,
      TRUE,
      INFINITE
    );

    for (j = 1; j < si.dwNumberOfProcessors; j++)
      SetEvent (hLeaveBarrierEvent[j]);
  }

  /*  Phase 3 (RAM->RAM):  time NUM_TRIALS copies of cbCurXfer bytes  */
  /*  from pBuf1 to pBuf2 for each transfer size.                     */

  cbCurXfer = MIN_TRANSFER_SIZE;
  dwCurDataPtNum = 0;
  while (cbCurXfer <= MAX_TRANSFER_SIZE)
  {
    tElapsedRamRam[dwCurDataPtNum] = 0;

    for (j = 0; j < NUM_TRIALS; j++)
    {
      t1 = ReadPentiumCycleCount ();

      __asm
      {
        mov    ecx, cbCurXfer
        shr    ecx, 2
        cld
        mov    esi, pBuf1
        mov    edi, pBuf2
        rep    movsd
      }

      tElapsedRamRam[dwCurDataPtNum] += ReadPentiumCycleCount () - t1;
    }

    dwCurDataPtNum++;
    cbCurXfer *= 2;
  }

  /*  Let the sink threads terminate.  */

  if (si.dwNumberOfProcessors > 1)
  {
    WaitForMultipleObjects
    (
      si.dwNumberOfProcessors - 1,
      1 + hEnterBarrierEvent,
      TRUE,
      INFINITE
    );

    for (j = 1; j < si.dwNumberOfProcessors; j++)
      SetEvent (hLeaveBarrierEvent[j]);
  }

  SetPriorityClass (hProcess, dwOldPriorityClass);
  printf ("%s:  Back to normal priority.\n\n", pProgName);

  printf ("Xfer size (B)\tEAX->RAM\tRAM->EAX\tRAM->RAM\n");

  /*  One output row per transfer size.  For power-of-two size limits    */
  /*  this count equals the number of iterations each timing loop ran.   */

  cDataPtsReq = 1 + log2 (MAX_TRANSFER_SIZE) - log2 (MIN_TRANSFER_SIZE);
  cbCurXfer = MIN_TRANSFER_SIZE;

  /*  Each figure is bytes moved by this thread (NUM_TRIALS * cbCurXfer)  */
  /*  times cycles per second (dwCpuFreqHz), divided by elapsed cycles    */
  /*  and by MB, i.e. megabytes per second.  It is then multiplied by     */
  /*  the processor count; only this thread is actually timed, so that    */
  /*  presumably assumes every sink thread achieves the same rate.  The   */
  /*  RAM->RAM column carries an extra factor of 2 because each copied    */
  /*  byte is both read and written.                                      */

  for (j = 0; j < cDataPtsReq; j++, cbCurXfer *= 2)
    printf ("%13lu\t%8.2f\t%8.2f\t%8.2f\n",
             cbCurXfer,
             si.dwNumberOfProcessors *
               ((double) NUM_TRIALS * cbCurXfer * dwCpuFreqHz) /
               (tElapsedEaxOut[j] * (1 * MB)),
             si.dwNumberOfProcessors *
               ((double) NUM_TRIALS * cbCurXfer * dwCpuFreqHz) /
               (tElapsedEaxIn[j] * (1 * MB)),
             si.dwNumberOfProcessors *
               2 * ((double) NUM_TRIALS * cbCurXfer * dwCpuFreqHz) /
               (tElapsedRamRam[j] * (1 * MB)));
}  /*  End MeasureBandwidth ()  */
 

/**************************************************************************
*  main()
*/

int main (int argc, char *argv[])
{
  DWORD                       j;
  DWORD                       dwThreadId;
  DWORD                       cDataPtsReq;

  pProgName = *argv;

  if (!EstimatePentiumClock (&dwCpuFreqHz))
  {
    printf ("%s:  EstimatePentiumClock() failed\n", pProgName);
    exit (1);
  }

  printf ("%s:  CPU clock frequency == %u MHz\n", pProgName, dwCpuFreqHz);
  dwCpuFreqHz *= 1000000;

  GetSystemInfo (&si);

  printf ("%s:  Number of processors == %u\n", pProgName,
          si.dwNumberOfProcessors);

  if (si.dwNumberOfProcessors > MAX_NUM_CPUS)
  {
    printf ("%s:  Number of processors in system exceeds compile-time "
            "constant.\n", pProgName);
    exit (1);
  }

  /*  Determine the space we need to record each memory characteristic.  */

  cDataPtsReq = 1 + log2 (MAX_TRANSFER_SIZE) - log2 (MIN_TRANSFER_SIZE);
  if (cDataPtsReq > MAX_NUM_DATAPTS - 1)
  {
    printf ("%s:  Recompile with MAX_NUM_DATAPTS >= %u\n", pProgName,
            cDataPtsReq);
    exit (1);
  }

  /*  Make sure we can use all of the CPUs in the system.  */

  hProcess = GetCurrentProcess ();

  if (!GetProcessAffinityMask (hProcess, &dwProcAffMask, &dwSysAffMask))
  {
    printf ("%s:  GetProcessAffinityMask() failed.\n", pProgName);
    exit (1);
  }

  if (bitcount (dwProcAffMask) < si.dwNumberOfProcessors)
  {
    printf ("%s:  Not all of the system's CPUs are available.\n");
    exit (1);
  }

  /*  Pin down enough space so that all CPUs can copy MAX_TRANSFER_SIZE  */
  /*  bytes from one buffer to another simultaneously without paging.    */

  cbPinSize =
    SetMaxPinSize (2 * si.dwNumberOfProcessors * MAX_TRANSFER_SIZE);

  if (!cbPinSize)
  {
    printf ("%s:  SetMaxPinSize() failed.\n", pProgName);
    exit (1);
  }

  pCopyBuf = VirtualAlloc     /*  pCopyBuf is aligned on 64 k boundary.  */
             (
               NULL,
               cbPinSize,
               MEM_COMMIT | MEM_RESERVE,
               PAGE_READWRITE
             );

  if (!pCopyBuf)
  {
    printf ("%s:  VirtualAlloc() failed.\n", pProgName);
    exit (1);
  }

  if (!VirtualLock (pCopyBuf, cbPinSize))
  {
    printf ("%s:  VirtualLock() failed (%d).\n", pProgName,
            GetLastError ());
    exit (1);
  }

  printf ("%s:  Pinned down %u bytes.\n", pProgName, cbPinSize);

  /*  On multiprocessor systems, run a sink thread on n - 1 CPUs while  */
  /*  the main thread runs on CPU 0, measuring aggregate bandwidth.     */

  for (j = 1; j < si.dwNumberOfProcessors; j++)
  {
    hLeaveBarrierEvent[j] = CreateEvent (NULL, FALSE, FALSE, NULL);

    if (!hLeaveBarrierEvent[j])
    {
      printf ("%s:  CreateEvent() failed (%u).\n", pProgName,
              GetLastError ());
      exit (1);
    }

    hEnterBarrierEvent[j] = CreateEvent (NULL, FALSE, FALSE, NULL);

    if (!hEnterBarrierEvent[j])
    {
      printf ("%s:  CreateEvent() failed (%u).\n", pProgName,
              GetLastError ());
      exit (1);
    }

    hThread[j] = CreateThread
                 (
                   NULL,
                   0,                           /*  Default stack size.  */
                   BandwidthSinkThread,
                   (LPVOID) j,
                   0,
                   &dwThreadId
                 );

    if (!hThread[j])
    {
      printf ("%s:  CreateThread() failed (%u).\n", pProgName,
              GetLastError ());
      exit (1);
    }

    if (!SetThreadAffinityMask (hThread[j], 1 << j))
    {
      printf ("%s:  SetThreadAffinityMask() failed (%u).\n", pProgName,
              GetLastError ());
      exit (1);
    }
  }

  /*  Measure aggregate memory system bandwidth.  */

  SetThreadAffinityMask (GetCurrentThread (), 1);

  printf ("%s:  \"RAM->RAM\" indicates aggregate read+write bandwidth.\n",
          pProgName);

  if (si.dwNumberOfProcessors > 1)
  {
    printf ("%s:  Measuring memory bandwidth with %u threads.\n",
            pProgName, si.dwNumberOfProcessors);

    MeasureBandwidth ();
  }

  /*  Measure memory bandwidth with only one CPU beating on it.  */

  printf ("\n%s:  Measuring memory bandwidth with 1 thread.\n", pProgName);

  si.dwNumberOfProcessors = 1;
  MeasureBandwidth ();

  return (0);
}