mirror of
https://github.com/openmm/openmm
synced 2026-06-03 06:39:48 +09:00
Fixed a performance regression in multi-GPU on CUDA
This commit is contained in:
@@ -6,9 +6,9 @@ import sys
|
||||
from datetime import datetime
|
||||
from optparse import OptionParser
|
||||
|
||||
-def timeIntegration(context, steps):
|
||||
+def timeIntegration(context, steps, initialSteps):
|
||||
"""Integrate a Context for a specified number of steps, then return how many seconds it took."""
|
||||
-context.getIntegrator().step(5) # Make sure everything is fully initialized
|
||||
+context.getIntegrator().step(initialSteps) # Make sure everything is fully initialized
|
||||
context.getState(getEnergy=True)
|
||||
start = datetime.now()
|
||||
context.getIntegrator().step(steps)
|
||||
@@ -79,11 +79,14 @@ def runOneTest(testName, options):
|
||||
system = ff.createSystem(pdb.topology, nonbondedMethod=method, nonbondedCutoff=cutoff, constraints=constraints, hydrogenMass=hydrogenMass)
|
||||
print('Step Size: %g fs' % dt.value_in_unit(unit.femtoseconds))
|
||||
properties = {}
|
||||
+initialSteps = 5
|
||||
if options.device is not None:
|
||||
if platform.getName() == 'CUDA':
|
||||
properties['CudaDeviceIndex'] = options.device
|
||||
elif platform.getName() == 'OpenCL':
|
||||
properties['OpenCLDeviceIndex'] = options.device
|
||||
+if ',' in options.device or ' ' in options.device:
|
||||
+initialSteps = 250
|
||||
if options.precision is not None:
|
||||
if platform.getName() == 'CUDA':
|
||||
properties['CudaPrecision'] = options.precision
|
||||
@@ -102,7 +105,7 @@ def runOneTest(testName, options):
|
||||
context.setVelocitiesToTemperature(300*unit.kelvin)
|
||||
steps = 20
|
||||
while True:
|
||||
-time = timeIntegration(context, steps)
|
||||
+time = timeIntegration(context, steps, initialSteps)
|
||||
if time >= 0.5*options.seconds:
|
||||
break
|
||||
if time < 0.5:
|
||||
|
||||
@@ -83,7 +83,7 @@ private:
|
||||
std::vector<Kernel> kernels;
|
||||
std::vector<long long> completionTimes;
|
||||
std::vector<double> contextNonbondedFractions;
|
||||
-std::vector<int> tileCounts;
|
||||
+int* tileCounts;
|
||||
CudaArray* contextForces;
|
||||
void* pinnedPositionBuffer;
|
||||
long long* pinnedForceBuffer;
|
||||
|
||||
@@ -99,7 +99,7 @@ public:
|
||||
}
|
||||
void execute() {
|
||||
// Execute the kernel, then download forces.
|
||||
|
||||
|
||||
energy += kernel.finishComputation(context, includeForce, includeEnergy, groups, valid);
|
||||
if (cu.getComputeForceCount() < 200) {
|
||||
// Record timing information for load balancing. Since this takes time, only do it at the start of the simulation.
|
||||
@@ -141,7 +141,7 @@ private:
|
||||
|
||||
CudaParallelCalcForcesAndEnergyKernel::CudaParallelCalcForcesAndEnergyKernel(string name, const Platform& platform, CudaPlatform::PlatformData& data) :
|
||||
CalcForcesAndEnergyKernel(name, platform), data(data), completionTimes(data.contexts.size()), contextNonbondedFractions(data.contexts.size()),
|
||||
-tileCounts(data.contexts.size()), contextForces(NULL), pinnedPositionBuffer(NULL), pinnedForceBuffer(NULL) {
|
||||
+tileCounts(NULL), contextForces(NULL), pinnedPositionBuffer(NULL), pinnedForceBuffer(NULL) {
|
||||
for (int i = 0; i < (int) data.contexts.size(); i++)
|
||||
kernels.push_back(Kernel(new CudaCalcForcesAndEnergyKernel(name, platform, *data.contexts[i])));
|
||||
}
|
||||
@@ -156,6 +156,8 @@ CudaParallelCalcForcesAndEnergyKernel::~CudaParallelCalcForcesAndEnergyKernel()
|
||||
cuMemFreeHost(pinnedForceBuffer);
|
||||
cuEventDestroy(event);
|
||||
cuStreamDestroy(peerCopyStream);
|
||||
+if (tileCounts != NULL)
|
||||
+cuMemFreeHost(tileCounts);
|
||||
}
|
||||
|
||||
void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
|
||||
@@ -163,12 +165,14 @@ void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
|
||||
cu.setAsCurrent();
|
||||
CUmodule module = cu.createModule(CudaKernelSources::parallel);
|
||||
sumKernel = cu.getKernel(module, "sumForces");
|
||||
-for (int i = 0; i < (int) kernels.size(); i++)
|
||||
+int numContexts = data.contexts.size();
|
||||
+for (int i = 0; i < numContexts; i++)
|
||||
getKernel(i).initialize(system);
|
||||
-for (int i = 0; i < (int) contextNonbondedFractions.size(); i++)
|
||||
-contextNonbondedFractions[i] = 1/(double) contextNonbondedFractions.size();
|
||||
+for (int i = 0; i < numContexts; i++)
|
||||
+contextNonbondedFractions[i] = 1/(double) numContexts;
|
||||
CHECK_RESULT(cuEventCreate(&event, 0), "Error creating event");
|
||||
CHECK_RESULT(cuStreamCreate(&peerCopyStream, CU_STREAM_NON_BLOCKING), "Error creating stream");
|
||||
+CHECK_RESULT(cuMemHostAlloc((void**) &tileCounts, numContexts*sizeof(int), 0), "Error creating tile count buffer");
|
||||
}
|
||||
|
||||
void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) {
|
||||
|
||||
Reference in New Issue
Block a user