[Yt-svn] yt-commit r1032 - trunk/yt/lagos

mturk at wrangler.dreamhost.com
Tue Dec 23 21:24:06 PST 2008


Author: mturk
Date: Tue Dec 23 21:24:06 2008
New Revision: 1032
URL: http://yt.spacepope.org/changeset/1032

Log:
Adding _cudaIsBound to DerivedQuantities.  If it is able to import pycuda and
finds at least one available device, the gravitational binding energy will be
calculated with a CUDA routine.  In my tests I saw speedups by a factor of
about 140, with a relative difference between the CUDA and C results on the
order of 1e-7.
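
For reference, this is exercised through the normal derived-quantity
interface.  Roughly (untested; the dataset path and sphere parameters here
are made up, and I'm assuming the existing truncate keyword):

    from yt.mods import *
    pf = EnzoStaticOutput("DD0010/data0010")   # hypothetical dataset
    sp = pf.h.sphere([0.5, 0.5, 0.5], 0.1)
    # Uses the CUDA routine when pycuda and a device are available, and
    # falls back to PointCombine.FindBindingEnergy otherwise.
    bound = sp.quantities["IsBound"](truncate=True)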

There's a good chance that the import structure here will be updated, but for
now this should handle things appropriately.

Additionally, my CUDA code is likely very naive, and I bet the arrays could be
moved on and off the device more easily if I used GPUArray; see the sketch
below.
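
For instance, something like this (untested) would replace the explicit
mem_alloc/memcpy_htod/memcpy_dtoh bookkeeping:

    import pycuda.autoinit
    import pycuda.gpuarray as gpuarray

    # to_gpu allocates device memory and copies the host array in one step;
    # get() copies the result back into a new host array.
    x_gpu = gpuarray.to_gpu(x)                  # x: the float32 host array
    p_gpu = gpuarray.zeros(x.shape, dtype='float32')
    # the kernel launch then takes the underlying device pointers:
    #   func(x_gpu.gpudata, ..., p_gpu.gpudata, ...)
    p = p_gpu.get()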



Modified:
   trunk/yt/lagos/DerivedQuantities.py

Modified: trunk/yt/lagos/DerivedQuantities.py
==============================================================================
--- trunk/yt/lagos/DerivedQuantities.py	(original)
+++ trunk/yt/lagos/DerivedQuantities.py	Tue Dec 23 21:24:06 2008
@@ -102,7 +102,7 @@
     def keys(self):
         return self.functions.keys()
 
-def _TotalMass(self, data):
+def _TotalMass(data):
     """
     This function takes no arguments and returns the sum of cell masses and
     particle masses in the object.
@@ -227,15 +227,111 @@
     # We only divide once here because we have velocity in cgs, but radius is
     # in code.
     G = 6.67e-8 / data.convert("cm") # cm^3 g^-1 s^-2
-    pot = 2*G*PointCombine.FindBindingEnergy(data["CellMass"],
-                                  data['x'],data['y'],data['z'],
-                                  truncate, kinetic/(2*G))
+    import time
+    t1 = time.time()
+    try:
+        pot = 2*G*_cudaIsBound(data, truncate, kinetic/(2*G))
+    except (ImportError, AssertionError):
+        pot = 2*G*PointCombine.FindBindingEnergy(data["CellMass"],
+                                      data['x'],data['y'],data['z'],
+                                      truncate, kinetic/(2*G))
+    mylog.info("Boundedness check took %0.3e seconds", time.time()-t1)
     return [(pot / kinetic)]
 def _combIsBound(data, bound):
     return bound
 add_quantity("IsBound",function=_IsBound,combine_function=_combIsBound,n_ret=1,
              force_unlazy=True)
 
+def _cudaIsBound(data, truncate, ratio):
+    # truncate and ratio are accepted for interface parity with the C
+    # routine, but this kernel always computes the full pairwise sum.
+    # An ImportError (no pycuda) or AssertionError (no device) propagates
+    # up to _IsBound, which then falls back to the C implementation.
+    import pycuda.driver as cuda
+    import pycuda.autoinit
+    assert cuda.Device.count() >= 1
+
+    # Now the tedious process of rescaling our values, so the float32
+    # copies sent to the device are reasonably conditioned...
+    length_scale_factor = data['dx'].max()/data['dx'].min()
+    mass_scale_factor = 1.0/(data['CellMass'].max())
+    x = ((data['x'] - data['x'].min()) * length_scale_factor).astype('float32')
+    y = ((data['y'] - data['y'].min()) * length_scale_factor).astype('float32')
+    z = ((data['z'] - data['z'].min()) * length_scale_factor).astype('float32')
+    m = (data['CellMass'] * mass_scale_factor).astype('float32')
+    p = na.zeros(z.shape, dtype='float32')
+    x_gpu = cuda.mem_alloc(x.size * x.dtype.itemsize)
+    y_gpu = cuda.mem_alloc(y.size * y.dtype.itemsize)
+    z_gpu = cuda.mem_alloc(z.size * z.dtype.itemsize)
+    m_gpu = cuda.mem_alloc(m.size * m.dtype.itemsize)
+    p_gpu = cuda.mem_alloc(p.size * p.dtype.itemsize)
+    for ag, a in [(x_gpu, x), (y_gpu, y), (z_gpu, z), (m_gpu, m), (p_gpu, p)]:
+        cuda.memcpy_htod(ag, a)
+    source = """
+
+      extern __shared__ float array[];
+
+      __global__ void isbound(float *x, float *y, float *z, float *m,
+                              float *p)
+      {
+
+        /* My index in the array */
+        int idx1 = blockIdx.x * blockDim.x + threadIdx.x;
+        /* Note we are setting a start index */
+        int idx2 = blockIdx.y * blockDim.x;
+        int offset = threadIdx.x;
+
+        float* x_data1 = (float*) array;
+        float* y_data1 = (float*) &x_data1[blockDim.x];
+        float* z_data1 = (float*) &y_data1[blockDim.x];
+        float* m_data1 = (float*) &z_data1[blockDim.x];
+
+        float* x_data2 = (float*) &m_data1[blockDim.x];
+        float* y_data2 = (float*) &x_data2[blockDim.x];
+        float* z_data2 = (float*) &y_data2[blockDim.x];
+        float* m_data2 = (float*) &z_data2[blockDim.x];
+
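+        /* Stage this block's row of points and blockIdx.y's column of
+           points into shared memory.  Note there is no bounds check: with
+           gsize = ceil(n/blockDim.x), the last block can index past the
+           end of the inputs unless n is a multiple of blockDim.x. */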
+        x_data1[offset] = x[idx1];
+        y_data1[offset] = y[idx1];
+        z_data1[offset] = z[idx1];
+        m_data1[offset] = m[idx1];
+
+        x_data2[offset] = x[idx2 + offset];
+        y_data2[offset] = y[idx2 + offset];
+        z_data2[offset] = z[idx2 + offset];
+        m_data2[offset] = m[idx2 + offset];
+
+        __syncthreads();
+
+        float tx, ty, tz;
+
+        float my_p = 0.0;
+
+        for (int i = 0; i < blockDim.x; i++){
+            if(i + idx2 < idx1 + 1) continue;
+            tx = (x_data1[offset]-x_data2[i]);
+            ty = (y_data1[offset]-y_data2[i]);
+            tz = (z_data1[offset]-z_data2[i]);
+            my_p += m_data1[offset]*m_data2[i] /
+                sqrt(tx*tx+ty*ty+tz*tz);
+        }
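+        /* Note: blocks sharing blockIdx.x but differing in blockIdx.y all
+           update p[idx1]; without an atomic add this read-modify-write can
+           race across blocks. */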
+        p[idx1] += my_p;
+        __syncthreads();
+      }
+    """
+    bsize = 256
+    mod = cuda.SourceModule(source, keep=True)
+    func = mod.get_function('isbound')
+    import math
+    gsize=int(math.ceil(float(m.size)/bsize))
+    mylog.info("Running CUDA functions.  May take a while.  (%0.5e, %s)",
+               x.size, gsize)
+    # With time_kernel=True, the launch returns the kernel run time in
+    # seconds.
+    ret = func(x_gpu, y_gpu, z_gpu, m_gpu, p_gpu,
+               shared=8*bsize*m.dtype.itemsize,
+               block=(bsize,1,1), grid=(gsize, gsize), time_kernel=True)
+    mylog.info("Kernel took %0.3e seconds", ret)
+    cuda.memcpy_dtoh(p, p_gpu)
+    p1 = p.sum()
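+    # Each pairwise term was computed as (Msf*m_i)*(Msf*m_j)/(Lsf*r_ij), so
+    # multiplying by Lsf/Msf**2 undoes the rescaling.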
+    return p1 * (length_scale_factor / (mass_scale_factor**2.0))
     
 def _Extrema(data, fields):
     """

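As a sanity check, this is the quantity the kernel accumulates, written out
in plain NumPy (names here are illustrative; this is the O(N^2) pairwise sum,
not part of the committed code):

    import numpy as np

    def binding_energy_reference(m, x, y, z):
        # Sum over unordered pairs i < j of m_i*m_j / |r_i - r_j|; the
        # kernel's "i + idx2 < idx1 + 1" skip enforces the same i < j rule.
        total = 0.0
        for i in range(m.size - 1):
            dx = x[i] - x[i+1:]
            dy = y[i] - y[i+1:]
            dz = z[i] - z[i+1:]
            total += (m[i] * m[i+1:] / np.sqrt(dx*dx + dy*dy + dz*dz)).sum()
        return total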

