[Yt-svn] yt-commit r1089 - trunk/yt/lagos

mturk at wrangler.dreamhost.com
Wed Jan 7 21:38:54 PST 2009


Author: mturk
Date: Wed Jan  7 21:38:53 2009
New Revision: 1089
URL: http://yt.spacepope.org/changeset/1089

Log:
Major improvements to clump finder.

Field caching is now available, but it is off by default.  It is controlled by
the cached_fields argument to the Clump initializer -- you have to construct
the cache and feed it in yourself.  Something like:

defaultdict(lambda:dict())

will do it.  (When you finish with the clump finding, you should call .clear()
on this object!)
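
A quick sketch of the intended usage (untested; data_source and the
threshold value are placeholders, and I'm assuming None is acceptable as
the root clump's parent):

    from collections import defaultdict

    # One cache shared by the whole clump hierarchy; identify_contours
    # fills it as fields are pulled from the grids.
    cache = defaultdict(lambda: dict())
    master = Clump(data_source, None, "Density", cached_fields=cache)
    master.find_children(1e-24)  # placeholder minimum value for the field
    # ... recurse over master.children as usual ...
    cache.clear()  # release the cached field data when you're done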

Additionally, I rewrote the queueing system for the contour finder; in my
tests this sped things up by about a factor of six.  I should have run a
profiler on this code long ago.
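
The heart of the change is replacing per-grid list scans in add() with set
operations; schematically (taken straight from the new code below):

    # before: O(n) list membership tests for every grid added
    #   if g not in self.white_list: continue
    #   if g in self.considered: continue
    # after: one intersection and one difference per call
    to_check = self.white_list.intersection(grids)
    if not force:
        to_check.difference_update(self.considered)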

Fixed the CUDA IsBound to check for small numbers of cells -- if we don't have
lots of cells, we just fall back to the C code.  This helps with cases where
you pick out small regions of cells, like in high-Jeans runs.
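
The fallback rides on the existing try/except in the quantity function:
_cudaIsBound now asserts the problem is big enough for the GPU, and the
AssertionError lands us in the C path.  Roughly (the try branch is my
paraphrase; only the except branch appears in the hunk below):

    try:
        # GPU path; the new asserts in _cudaIsBound fire when m.size is
        # at or below the block size, or when the block grid is too small.
        pot = 2*G*_cudaIsBound(data, truncate, kinetic/(2*G))
    except (ImportError, AssertionError):
        # No pycuda, or too few cells: fall back to the C implementation.
        pot = 2*G*PointCombine.FindBindingEnergy(data["CellMass"],
                                      data['x'],data['y'],data['z'],
                                      truncate, kinetic/(2*G))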

The change to CoveringGrid where I commented out the field assignments for
'dx', 'dy', and 'dz' might be problematic, but it looks valid to me and the
tests all pass.

Fixed some potential issues with empty grid cell counts in extracted regions.

All in all, I am quite pleased with the clump finder now.  It's substantially
faster and the accuracy has been retained.

I'll probably convert the format-string'd cuda.SourceModule to take a
passed-in kernel parameter instead, but for now it's not slowing us down much.
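
Something like this is what I have in mind (untested sketch; the kernel
would take the count by value, int nelem, rather than the int *nelem
currently in the signature, and the trivial kernel body here is just to
show the mechanism):

    import numpy as na
    import pycuda.autoinit
    import pycuda.driver as cuda

    source = """
      __global__ void isbound(float *p, int nelem)
      {
        int idx = blockIdx.x * blockDim.x + threadIdx.x;
        /* the bounds guard that %(p)s currently provides via formatting */
        if (idx < nelem) p[idx] += 1.0f;
      }
    """
    mod = cuda.SourceModule(source)  # no % substitution needed
    func = mod.get_function('isbound')
    p = na.zeros(1000, dtype='float32')
    func(cuda.InOut(p), na.int32(p.size), block=(256,1,1), grid=(4,1))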



Modified:
   trunk/yt/lagos/BaseDataTypes.py
   trunk/yt/lagos/Clump.py
   trunk/yt/lagos/ContourFinder.py
   trunk/yt/lagos/DerivedQuantities.py

Modified: trunk/yt/lagos/BaseDataTypes.py
==============================================================================
--- trunk/yt/lagos/BaseDataTypes.py	(original)
+++ trunk/yt/lagos/BaseDataTypes.py	Wed Jan  7 21:38:53 2009
@@ -360,7 +360,7 @@
             fields_to_get = self.fields
         else:
             fields_to_get = ensure_list(fields)
-        mylog.debug("Going to obtain %s (%s)", fields_to_get, self.fields)
+        mylog.debug("Going to obtain %s", fields_to_get)
         for field in fields_to_get:
             if self.data.has_key(field):
                 continue
@@ -1445,7 +1445,7 @@
         grid_vals, xi, yi, zi = [], [], [], []
         for grid in self._base_region._grids:
             xit,yit,zit = self._base_region._get_point_indices(grid)
-            grid_vals.append(na.ones(xit.shape, dtype='int64') * grid.id-grid._id_offset)
+            grid_vals.append(na.ones(xit.shape, dtype='int') * (grid.id-grid._id_offset))
             xi.append(xit)
             yi.append(yit)
             zi.append(zit)
@@ -1456,7 +1456,7 @@
         xi = na.concatenate(xi)[self._base_indices][grid_order]
         yi = na.concatenate(yi)[self._base_indices][grid_order]
         zi = na.concatenate(zi)[self._base_indices][grid_order]
-        bc = na.bincount(grid_vals.astype("int32"))
+        bc = na.bincount(grid_vals)
         splits = []
         for i,v in enumerate(bc):
             if v > 0: splits.append(v)
@@ -1465,17 +1465,16 @@
         self._indices = {}
         for grid_id, x, y, z in zip(grid_ids, xis, yis, zis):
             # grid_id needs no offset
-            self._indices[grid_id] = (x.astype('int64'),
-                                      y.astype('int64'),
-                                      z.astype('int64'))
+            self._indices[grid_id] = (x, y, z)
         self._grids = self._base_region.pf.h.grids[self._indices.keys()]
 
     def _is_fully_enclosed(self, grid):
-        return (self._indices[grid.id-1][0].size == grid.ActiveDimensions.prod())
+        return (self._indices[grid.id-grid._id_offset][0].size == grid.ActiveDimensions.prod())
 
+    __empty_array = na.array([], dtype='bool')
     def _get_point_indices(self, grid, use_child_mask=True):
         # Yeah, if it's not true, we don't care.
-        return self._indices.get(grid.id-grid._id_offset, ())
+        return self._indices.get(grid.id-grid._id_offset, self.__empty_array)
 
 class InLineExtractedRegionBase(AMR3DData):
     """
@@ -1767,6 +1766,7 @@
         self._refresh_data()
 
     def _get_list_of_grids(self):
+        if self._grids is not None: return
         if na.any(self.left_edge < self.pf["DomainLeftEdge"]) or \
            na.any(self.right_edge > self.pf["DomainRightEdge"]):
             grids,ind = self.pf.hierarchy.get_periodic_box_grids(
@@ -1784,9 +1784,9 @@
 
     def _refresh_data(self):
         AMR3DData._refresh_data(self)
-        self['dx'] = self.dx * na.ones(self.ActiveDimensions, dtype='float64')
-        self['dy'] = self.dy * na.ones(self.ActiveDimensions, dtype='float64')
-        self['dz'] = self.dz * na.ones(self.ActiveDimensions, dtype='float64')
+        #self['dx'] = self.dx * na.ones(self.ActiveDimensions, dtype='float64')
+        #self['dy'] = self.dy * na.ones(self.ActiveDimensions, dtype='float64')
+        #self['dz'] = self.dz * na.ones(self.ActiveDimensions, dtype='float64')
 
     def get_data(self, field=None):
         self._get_list_of_grids()
@@ -1828,11 +1828,8 @@
             fields_to_get = self.fields
         else:
             fields_to_get = ensure_list(field)
-        for field in fields_to_get:
-            #mylog.debug("Flushing field %s to %s possible grids",
-                       #field, len(self._grids))
-            for grid in self._grids:
-                self._flush_data_to_grid(grid, field)
+        for grid in self._grids:
+            self._flush_data_to_grid(grid, fields_to_get)
 
     @restore_grid_state
     def _get_data_from_grid(self, grid, fields):

Modified: trunk/yt/lagos/Clump.py
==============================================================================
--- trunk/yt/lagos/Clump.py	(original)
+++ trunk/yt/lagos/Clump.py	Wed Jan  7 21:38:53 2009
@@ -28,23 +28,26 @@
 
 class Clump(object):
     children = None
-    def __init__(self, data, parent, field):
+    def __init__(self, data, parent, field, cached_fields = None):
         self.parent = parent
         self.data = data
         self.field = field
         self.min = self.data[field].min()
         self.max = self.data[field].max()
         self.isBound = None
+        self.cached_fields = cached_fields
 
     def find_children(self, min, max = None):
         if self.children is not None:
             print "Wiping out existing children clumps."
         self.children = []
         if max is None: max = self.max
-        contour_info = identify_contours(self.data, self.field, min, max)
+        contour_info = identify_contours(self.data, self.field, min, max,
+                                         self.cached_fields)
         for cid in contour_info:
             new_clump = self.data.extract_region(contour_info[cid])
-            self.children.append(Clump(new_clump, self, self.field))
+            self.children.append(Clump(new_clump, self, self.field,
+                                    self.cached_fields))
 
     def get_IsBound(self):
         if self.isBound is None:

Modified: trunk/yt/lagos/ContourFinder.py
==============================================================================
--- trunk/yt/lagos/ContourFinder.py	(original)
+++ trunk/yt/lagos/ContourFinder.py	Wed Jan  7 21:38:53 2009
@@ -26,7 +26,7 @@
 from yt.lagos import *
 
 class GridConsiderationQueue:
-    def __init__(self, white_list = None, priority_func=None):
+    def __init__(self, white_list, priority_func=None):
         """
         This class exists to serve the contour finder.  It ensures that
         we can create a cascading set of queue dependencies, and if
@@ -34,37 +34,25 @@
         of the queue again.  It likely has few uses.
         """
         self.to_consider = []
-        self.considered = []
+        self.considered = set()
         self.n = 0
-        if white_list != None:
-            self.white_list = white_list
-        else:
-            self.white_list = []
+        self.white_list = set(white_list)
         self.priority_func = priority_func
 
     def add(self, grids, force=False):
-        if hasattr(grids,'size'):
-            grids = grids.tolist()
-        else:
+        if not hasattr(grids,'size'):
             grids = ensure_list(grids)
-        if self.priority_func:
-            grids.sort(key=self.priority_func)
-        else:
-            grids.sort()
-        if force:
-            for g in grids:
-                for i in range(self.considered.count(g)):
-                    del self.considered[self.considered.index(g)]
         i = self.n
-        for g in grids:
-            if g not in self.white_list: continue
-            if g in self.considered: continue
-            if g in self.to_consider[i:]:
-                del self.to_consider[i+self.to_consider[i:].index(g)]
+        to_check = self.white_list.intersection(grids)
+        if not force: to_check.difference_update(self.considered)
+        for g in sorted(to_check, key=self.priority_func):
+            try:
+                # We only delete from subsequent checks
+                del self.to_consider[self.to_consider.index(g, i)]
                 self.to_consider.insert(i,g)
                 i += 1
-                continue
-            self.to_consider.append(g)
+            except ValueError:
+                self.to_consider.append(g)
 
     def __iter__(self):
         return self
@@ -73,7 +61,7 @@
         if self.n >= len(self.to_consider):
             raise StopIteration
         tr = self.to_consider[self.n]
-        self.considered.append(tr)
+        self.considered.add(tr)
         self.n += 1
         return tr
 
@@ -101,7 +89,7 @@
     mylog.info("Contouring over %s cells with %s candidates", contour_ids[0],np)
     data_source["tempContours"][contour_ind] = contour_ids[:]
     data_source._flush_data_to_grids("tempContours", -1, dtype='int32')
-    my_queue = GridConsiderationQueue(data_source._grids,
+    my_queue = GridConsiderationQueue(white_list = data_source._grids,
                     priority_func = lambda g: -1*g["tempContours"].max())
     my_queue.add(data_source._grids)
     for i,grid in enumerate(my_queue):
@@ -129,18 +117,21 @@
             cur_max_id -= local_ind[0].size
         fd = cg["tempContours"].astype('int64')
         fd_original = fd.copy()
-        xi_u,yi_u,zi_u = na.where(fd > -1)
-        cor_order = na.argsort(-1*fd[(xi_u,yi_u,zi_u)])
-        xi = xi_u[cor_order]
-        yi = yi_u[cor_order]
-        zi = zi_u[cor_order]
-        PointCombine.FindContours(fd, xi, yi, zi)
+        if na.all(fd > -1):
+            fd[:] = fd.max()
+        else:
+            xi_u,yi_u,zi_u = na.where(fd > -1)
+            cor_order = na.argsort(-1*fd[(xi_u,yi_u,zi_u)])
+            xi = xi_u[cor_order]
+            yi = yi_u[cor_order]
+            zi = zi_u[cor_order]
+            PointCombine.FindContours(fd, xi, yi, zi)
         cg["tempContours"] = fd.copy().astype('float64')
         cg.flush_data("tempContours")
         my_queue.add(cg._grids)
         force_ind = na.unique(cg["GridIndices"][na.where(
             (cg["tempContours"] > fd_original)
-          & (cg["GridIndices"] != grid.id-1) )])
+          & (cg["GridIndices"] != grid.id-grid._id_offset) )])
         if len(force_ind) > 0:
             my_queue.add(data_source.hierarchy.grids[force_ind.astype('int32')], force=True)
         for ax in 'xyz':

Modified: trunk/yt/lagos/DerivedQuantities.py
==============================================================================
--- trunk/yt/lagos/DerivedQuantities.py	(original)
+++ trunk/yt/lagos/DerivedQuantities.py	Wed Jan  7 21:38:53 2009
@@ -27,6 +27,9 @@
 
 from yt.lagos import *
 from yt.funcs import get_pbar, wraps
+import math
+
+__CUDA_BLOCK_SIZE = 256
 
 quantity_info = {}
 
@@ -215,6 +218,8 @@
     """
     # Kinetic energy
     bv_x,bv_y,bv_z = data.quantities["BulkVelocity"]()
+    # One-cell objects are NOT BOUND.
+    if data["CellMass"].size == 1: return [0.0]
     kinetic = 0.5 * (data["CellMass"] * (
                        (data["x-velocity"] - bv_x)**2
                      + (data["y-velocity"] - bv_y)**2
@@ -234,7 +239,7 @@
     except (ImportError, AssertionError):
         pot = 2*G*PointCombine.FindBindingEnergy(data["CellMass"],
                                       data['x'],data['y'],data['z'],
-                                      False, kinetic/(2*G))
+                                      truncate, kinetic/(2*G))
     mylog.info("Boundedness check took %0.3e seconds", time.time()-t1)
     return [(pot / kinetic)]
 def _combIsBound(data, bound):
@@ -243,6 +248,7 @@
              force_unlazy=True)
 
 def _cudaIsBound(data, truncate, ratio):
+    bsize = __CUDA_BLOCK_SIZE
     import pycuda.driver as cuda
     import pycuda.autoinit
     import pycuda.gpuarray as gpuarray
@@ -250,14 +256,20 @@
     cuda.init()
     assert cuda.Device.count() >= 1
 
+    mass_scale_factor = 1.0/(data['CellMass'].max())
+    m = (data['CellMass'] * mass_scale_factor).astype('float32')
+    assert(m.size > bsize)
+
+    gsize=int(math.ceil(float(m.size)/bsize))
+    assert(gsize > 16)
+
     # Now the tedious process of rescaling our values...
     length_scale_factor = data['dx'].max()/data['dx'].min()
-    mass_scale_factor = 1.0/(data['CellMass'].max())
     x = ((data['x'] - data['x'].min()) * length_scale_factor).astype('float32')
     y = ((data['y'] - data['y'].min()) * length_scale_factor).astype('float32')
     z = ((data['z'] - data['z'].min()) * length_scale_factor).astype('float32')
-    m = (data['CellMass'] * mass_scale_factor).astype('float32')
     p = na.zeros(z.shape, dtype='float32')
+    
     x_gpu = cuda.mem_alloc(x.size * x.dtype.itemsize)
     y_gpu = cuda.mem_alloc(y.size * y.dtype.itemsize)
     z_gpu = cuda.mem_alloc(z.size * z.dtype.itemsize)
@@ -270,7 +282,7 @@
       extern __shared__ float array[];
 
       __global__ void isbound(float *x, float *y, float *z, float *m,
-                              float *p)
+                              float *p, int *nelem)
       {
 
         /* My index in the array */
@@ -305,23 +317,22 @@
 
         float my_p = 0.0;
 
-        for (int i = 0; i < blockDim.x; i++){
-            if(i + idx2 < idx1 + 1) continue;
-            tx = (x_data1[offset]-x_data2[i]);
-            ty = (y_data1[offset]-y_data2[i]);
-            tz = (z_data1[offset]-z_data2[i]);
-            my_p += m_data1[offset]*m_data2[i] /
-                sqrt(tx*tx+ty*ty+tz*tz);
+        if(idx1 < %(p)s) {
+            for (int i = 0; i < blockDim.x; i++){
+                if(i + idx2 < idx1 + 1) continue;
+                tx = (x_data1[offset]-x_data2[i]);
+                ty = (y_data1[offset]-y_data2[i]);
+                tz = (z_data1[offset]-z_data2[i]);
+                my_p += m_data1[offset]*m_data2[i] /
+                    sqrt(tx*tx+ty*ty+tz*tz);
+            }
         }
         p[idx1] += my_p;
         __syncthreads();
       }
     """
-    bsize = 256
-    mod = cuda.SourceModule(source % dict(p=m.size, b=bsize), keep=True)
+    mod = cuda.SourceModule(source % dict(p=m.size))
     func = mod.get_function('isbound')
-    import math
-    gsize=int(math.ceil(float(m.size)/bsize))
     mylog.info("Running CUDA functions.  May take a while.  (%0.5e, %s)",
                x.size, gsize)
     import pycuda.tools as ct
@@ -331,6 +342,7 @@
          block=(bsize,1,1), grid=(gsize, gsize), time_kernel=True)
     cuda.memcpy_dtoh(p, p_gpu)
     p1 = p.sum()
+    if na.any(na.isnan(p)): raise ValueError
     return p1 * (length_scale_factor / (mass_scale_factor**2.0))
     
 def _Extrema(data, fields):


