diff --git a/examples/benchmark.py b/examples/benchmark.py index 650f3848d..629162509 100644 --- a/examples/benchmark.py +++ b/examples/benchmark.py @@ -6,9 +6,9 @@ import sys from datetime import datetime from optparse import OptionParser -def timeIntegration(context, steps): +def timeIntegration(context, steps, initialSteps): """Integrate a Context for a specified number of steps, then return how many seconds it took.""" - context.getIntegrator().step(5) # Make sure everything is fully initialized + context.getIntegrator().step(initialSteps) # Make sure everything is fully initialized context.getState(getEnergy=True) start = datetime.now() context.getIntegrator().step(steps) @@ -79,11 +79,14 @@ def runOneTest(testName, options): system = ff.createSystem(pdb.topology, nonbondedMethod=method, nonbondedCutoff=cutoff, constraints=constraints, hydrogenMass=hydrogenMass) print('Step Size: %g fs' % dt.value_in_unit(unit.femtoseconds)) properties = {} + initialSteps = 5 if options.device is not None: if platform.getName() == 'CUDA': properties['CudaDeviceIndex'] = options.device elif platform.getName() == 'OpenCL': properties['OpenCLDeviceIndex'] = options.device + if ',' in options.device or ' ' in options.device: + initialSteps = 250 if options.precision is not None: if platform.getName() == 'CUDA': properties['CudaPrecision'] = options.precision @@ -102,7 +105,7 @@ def runOneTest(testName, options): context.setVelocitiesToTemperature(300*unit.kelvin) steps = 20 while True: - time = timeIntegration(context, steps) + time = timeIntegration(context, steps, initialSteps) if time >= 0.5*options.seconds: break if time < 0.5: diff --git a/platforms/cuda/include/CudaParallelKernels.h b/platforms/cuda/include/CudaParallelKernels.h index e43299a8a..6c2498b9b 100644 --- a/platforms/cuda/include/CudaParallelKernels.h +++ b/platforms/cuda/include/CudaParallelKernels.h @@ -83,7 +83,7 @@ private: std::vector kernels; std::vector completionTimes; std::vector contextNonbondedFractions; - std::vector tileCounts; + int* tileCounts; CudaArray* contextForces; void* pinnedPositionBuffer; long long* pinnedForceBuffer; diff --git a/platforms/cuda/src/CudaParallelKernels.cpp b/platforms/cuda/src/CudaParallelKernels.cpp index eae23836b..6c024cea8 100644 --- a/platforms/cuda/src/CudaParallelKernels.cpp +++ b/platforms/cuda/src/CudaParallelKernels.cpp @@ -99,7 +99,7 @@ public: } void execute() { // Execute the kernel, then download forces. - + energy += kernel.finishComputation(context, includeForce, includeEnergy, groups, valid); if (cu.getComputeForceCount() < 200) { // Record timing information for load balancing. Since this takes time, only do it at the start of the simulation. @@ -141,7 +141,7 @@ private: CudaParallelCalcForcesAndEnergyKernel::CudaParallelCalcForcesAndEnergyKernel(string name, const Platform& platform, CudaPlatform::PlatformData& data) : CalcForcesAndEnergyKernel(name, platform), data(data), completionTimes(data.contexts.size()), contextNonbondedFractions(data.contexts.size()), - tileCounts(data.contexts.size()), contextForces(NULL), pinnedPositionBuffer(NULL), pinnedForceBuffer(NULL) { + tileCounts(NULL), contextForces(NULL), pinnedPositionBuffer(NULL), pinnedForceBuffer(NULL) { for (int i = 0; i < (int) data.contexts.size(); i++) kernels.push_back(Kernel(new CudaCalcForcesAndEnergyKernel(name, platform, *data.contexts[i]))); } @@ -156,6 +156,8 @@ CudaParallelCalcForcesAndEnergyKernel::~CudaParallelCalcForcesAndEnergyKernel() cuMemFreeHost(pinnedForceBuffer); cuEventDestroy(event); cuStreamDestroy(peerCopyStream); + if (tileCounts != NULL) + cuMemFreeHost(tileCounts); } void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) { @@ -163,12 +165,14 @@ void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) { cu.setAsCurrent(); CUmodule module = cu.createModule(CudaKernelSources::parallel); sumKernel = cu.getKernel(module, "sumForces"); - for (int i = 0; i < (int) kernels.size(); i++) + int numContexts = data.contexts.size(); + for (int i = 0; i < numContexts; i++) getKernel(i).initialize(system); - for (int i = 0; i < (int) contextNonbondedFractions.size(); i++) - contextNonbondedFractions[i] = 1/(double) contextNonbondedFractions.size(); + for (int i = 0; i < numContexts; i++) + contextNonbondedFractions[i] = 1/(double) numContexts; CHECK_RESULT(cuEventCreate(&event, 0), "Error creating event"); CHECK_RESULT(cuStreamCreate(&peerCopyStream, CU_STREAM_NON_BLOCKING), "Error creating stream"); + CHECK_RESULT(cuMemHostAlloc((void**) &tileCounts, numContexts*sizeof(int), 0), "Error creating tile count buffer"); } void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) {