Fixed a performance regression in multi-GPU on CUDA

This commit is contained in:
peastman
2015-03-10 13:54:18 -07:00
parent 5069c6681e
commit 9d3a655b1f
3 changed files with 16 additions and 9 deletions

View File

@@ -6,9 +6,9 @@ import sys
from datetime import datetime
from optparse import OptionParser
def timeIntegration(context, steps):
def timeIntegration(context, steps, initialSteps):
"""Integrate a Context for a specified number of steps, then return how many seconds it took."""
context.getIntegrator().step(5) # Make sure everything is fully initialized
context.getIntegrator().step(initialSteps) # Make sure everything is fully initialized
context.getState(getEnergy=True)
start = datetime.now()
context.getIntegrator().step(steps)
@@ -79,11 +79,14 @@ def runOneTest(testName, options):
system = ff.createSystem(pdb.topology, nonbondedMethod=method, nonbondedCutoff=cutoff, constraints=constraints, hydrogenMass=hydrogenMass)
print('Step Size: %g fs' % dt.value_in_unit(unit.femtoseconds))
properties = {}
initialSteps = 5
if options.device is not None:
if platform.getName() == 'CUDA':
properties['CudaDeviceIndex'] = options.device
elif platform.getName() == 'OpenCL':
properties['OpenCLDeviceIndex'] = options.device
if ',' in options.device or ' ' in options.device:
initialSteps = 250
if options.precision is not None:
if platform.getName() == 'CUDA':
properties['CudaPrecision'] = options.precision
@@ -102,7 +105,7 @@ def runOneTest(testName, options):
context.setVelocitiesToTemperature(300*unit.kelvin)
steps = 20
while True:
time = timeIntegration(context, steps)
time = timeIntegration(context, steps, initialSteps)
if time >= 0.5*options.seconds:
break
if time < 0.5:

View File

@@ -83,7 +83,7 @@ private:
std::vector<Kernel> kernels;
std::vector<long long> completionTimes;
std::vector<double> contextNonbondedFractions;
std::vector<int> tileCounts;
int* tileCounts;
CudaArray* contextForces;
void* pinnedPositionBuffer;
long long* pinnedForceBuffer;

View File

@@ -99,7 +99,7 @@ public:
}
void execute() {
// Execute the kernel, then download forces.
energy += kernel.finishComputation(context, includeForce, includeEnergy, groups, valid);
if (cu.getComputeForceCount() < 200) {
// Record timing information for load balancing. Since this takes time, only do it at the start of the simulation.
@@ -141,7 +141,7 @@ private:
CudaParallelCalcForcesAndEnergyKernel::CudaParallelCalcForcesAndEnergyKernel(string name, const Platform& platform, CudaPlatform::PlatformData& data) :
CalcForcesAndEnergyKernel(name, platform), data(data), completionTimes(data.contexts.size()), contextNonbondedFractions(data.contexts.size()),
tileCounts(data.contexts.size()), contextForces(NULL), pinnedPositionBuffer(NULL), pinnedForceBuffer(NULL) {
tileCounts(NULL), contextForces(NULL), pinnedPositionBuffer(NULL), pinnedForceBuffer(NULL) {
for (int i = 0; i < (int) data.contexts.size(); i++)
kernels.push_back(Kernel(new CudaCalcForcesAndEnergyKernel(name, platform, *data.contexts[i])));
}
@@ -156,6 +156,8 @@ CudaParallelCalcForcesAndEnergyKernel::~CudaParallelCalcForcesAndEnergyKernel()
cuMemFreeHost(pinnedForceBuffer);
cuEventDestroy(event);
cuStreamDestroy(peerCopyStream);
if (tileCounts != NULL)
cuMemFreeHost(tileCounts);
}
void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
@@ -163,12 +165,14 @@ void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
cu.setAsCurrent();
CUmodule module = cu.createModule(CudaKernelSources::parallel);
sumKernel = cu.getKernel(module, "sumForces");
for (int i = 0; i < (int) kernels.size(); i++)
int numContexts = data.contexts.size();
for (int i = 0; i < numContexts; i++)
getKernel(i).initialize(system);
for (int i = 0; i < (int) contextNonbondedFractions.size(); i++)
contextNonbondedFractions[i] = 1/(double) contextNonbondedFractions.size();
for (int i = 0; i < numContexts; i++)
contextNonbondedFractions[i] = 1/(double) numContexts;
CHECK_RESULT(cuEventCreate(&event, 0), "Error creating event");
CHECK_RESULT(cuStreamCreate(&peerCopyStream, CU_STREAM_NON_BLOCKING), "Error creating stream");
CHECK_RESULT(cuMemHostAlloc((void**) &tileCounts, numContexts*sizeof(int), 0), "Error creating tile count buffer");
}
void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) {