[Yt-svn] yt: Modifications of the merger tree.

hg at spacepope.org
Wed Mar 31 15:51:41 PDT 2010


hg Repository: yt
details:   yt/rev/ce261b18ec2c
changeset: 1506:ce261b18ec2c
user:      Stephen Skory <stephenskory at yahoo.com>
date:      Wed Mar 31 15:51:34 2010 -0700
description:
Modifications of the merger tree.
- Now handles datasets with no halos.
- Faster halo membership comparisons. The slowest part of that step is
now writing to the SQLite file, so it's just about as fast as it will
get.
- Now uses the Fortran kd-tree for the nearest-neighbor searches (see
the short usage sketch below this description).

Also a few small changes to the halo finding code.
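
For reference, below is a minimal sketch of the Fortran kd-tree calls this
patch switches to, mirroring the usage in _find_likely_children in the diff
further down. The positions here are dummy data; in the patch they are halo
centers of mass read back from the SQLite Halos table.

    import numpy as na                       # yt's conventional numpy alias
    from yt.extensions.kdtree import *       # provides fKD, create_tree(), etc.

    # Dummy child halo centers of mass; the patch reads these from SQLite.
    child_points = na.random.random((100, 3))

    # Hand the points to the Fortran kd-tree as a 3 x N Fortran-ordered array.
    fKD.pos = na.asfortranarray(child_points.T)
    fKD.qv = na.empty(3, dtype='float64')    # query point, filled in per search
    fKD.dist = na.empty(5, dtype='float64')  # output array for the 5 neighbor distances
    fKD.tags = na.empty(5, dtype='int64')    # output array for the 5 neighbor indices (1-based)
    fKD.nn = 5                               # number of nearest neighbors to find
    fKD.sort = True
    fKD.rearrange = True
    create_tree()

    # Query one (dummy) parent halo center of mass.
    fKD.qv = na.array([0.5, 0.5, 0.5])
    find_nn_nearest_neighbors()
    nIDs = fKD.tags[:] - 1                   # convert Fortran 1-based tags to 0-based IDs

    del fKD.pos, fKD.tags, fKD.dist
    free_tree()                              # frees the kd-tree object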

diffstat:

 yt/extensions/MergerTree.py         |  353 ++++++++++++++++++++++++++++-----------
 yt/lagos/HaloFinding.py             |   24 +-
 yt/lagos/ParallelTools.py           |   45 +++++
 yt/lagos/parallelHOP/parallelHOP.py |    3 +-
 4 files changed, 311 insertions(+), 114 deletions(-)

diffs (truncated from 681 to 300 lines):

diff -r f2e555c03efe -r ce261b18ec2c yt/extensions/MergerTree.py
--- a/yt/extensions/MergerTree.py	Wed Mar 31 10:16:28 2010 -0700
+++ b/yt/extensions/MergerTree.py	Wed Mar 31 15:51:34 2010 -0700
@@ -24,7 +24,7 @@
 """
 
 import yt.lagos as lagos
-from yt.lagos.kd import *
+from yt.extensions.kdtree import *
 from yt.lagos.HaloFinding import HaloFinder
 from yt.logger import lagosLogger as mylog
 import yt.extensions.HaloProfiler as HP
@@ -41,7 +41,7 @@
 "SnapCurrentTimeIdentifier":"INTEGER",
 "SnapZ":"FLOAT",
 "SnapHaloID":"INTEGER",
-"DarkMatterMass":"FLOAT",
+"HaloMass":"FLOAT",
 "NumPart":"INTEGER",
 "CenMassX":"FLOAT",
 "CenMassY":"FLOAT",
@@ -63,7 +63,7 @@
 
 # In order.
 columns = ["GlobalHaloID", "SnapCurrentTimeIdentifier", "SnapZ", 
-"SnapHaloID", "DarkMatterMass", "NumPart", "CenMassX", "CenMassY",
+"SnapHaloID", "HaloMass", "NumPart", "CenMassX", "CenMassY",
 "CenMassZ", "BulkVelX", "BulkVelY", "BulkVelZ", "MaxRad",
 "ChildHaloID0", "ChildHaloFrac0",
 "ChildHaloID1", "ChildHaloFrac1",
@@ -94,6 +94,7 @@
             FOF_link_length=0.2, dm_only=False, refresh=False, sleep=1,
             index=True):
         self.restart_files = restart_files # list of enzo restart files
+        self.with_halos = na.ones(len(restart_files), dtype='bool')
         self.database = database # the sqlite database of haloes.
         self.halo_finder_function = halo_finder_function # which halo finder to use
         self.halo_finder_threshold = halo_finder_threshold # overdensity threshold
@@ -121,16 +122,19 @@
         self._create_halo_table()
         self._run_halo_finder_add_to_db()
         # Find the h5 file names for all the halos.
-        mylog.info("Opening HDF5 files.")
         for snap in self.restart_files:
             self._build_h5_refs(snap)
         # Loop over the pairs of snapshots to locate likely neighbors, and
         # then use those likely neighbors to compute fractional contributions.
-        for pair in zip(self.restart_files[:-1], self.restart_files[1:]):
-            self._build_h5_refs(pair[0])
-            self._build_h5_refs(pair[1])
+        last = None
+        for snap, pair in enumerate(zip(self.restart_files[:-1], self.restart_files[1:])):
+            if not self.with_halos[snap] or not self.with_halos[snap+1]:
+                continue
             self._find_likely_children(pair[0], pair[1])
-            self._compute_child_fraction(pair[0], pair[1])
+            # last is the data for the parent dataset, which can be supplied
+            # as the child from the previous round for all but the first loop.
+            last = self._compute_child_fraction(pair[0], pair[1], last)
+        del last
         if self.mine == 0 and index:
             mylog.info("Creating database index.")
             line = "CREATE INDEX IF NOT EXISTS HalosIndex ON Halos ("
@@ -140,7 +144,7 @@
             self.cursor.execute(line)
         self._barrier()
         self._close_database()
-        self._close_h5fp()
+        mylog.info("Done!")
         
     def _read_halo_lists(self):
         self.halo_lists = []
@@ -149,7 +153,7 @@
             self.halo_lists.append(hp.all_halos)
 
     def _run_halo_finder_add_to_db(self):
-        for file in self.restart_files:
+        for cycle, file in enumerate(self.restart_files):
             gc.collect()
             pf = lagos.EnzoStaticOutput(file)
             # If the halos are already found, skip this data step, unless
@@ -171,6 +175,10 @@
                 halos.write_out(os.path.join(dir, 'MergerHalos.out'))
                 halos.write_particle_lists(os.path.join(dir, 'MergerHalos'))
                 halos.write_particle_lists_txt(os.path.join(dir, 'MergerHalos'))
+                if len(halos) == 0:
+                    mylog.info("Dataset %s has no halos." % file)
+                    self.with_halos[cycle] = False
+                    continue
                 del halos
             # Now add halo data to the db if it isn't already there by
             # checking the first halo.
@@ -185,6 +193,10 @@
             # Read the halos off the disk using the Halo Profiler tools.
             hp = HP.HaloProfiler(file, halo_list_file='MergerHalos.out',
             halo_list_format={'id':0, 'mass':1, 'numpart':2, 'center':[7, 8, 9], 'velocity':[10, 11, 12], 'r_max':13})
+            if len(hp.all_halos) == 0:
+                mylog.info("Dataset %s has no halos." % file)
+                self.with_halos[cycle] = False
+                continue
             mylog.info("Entering halos into database for z=%f" % red)
             if self.mine == 0:
                 for ID,halo in enumerate(hp.all_halos):
@@ -252,7 +264,7 @@
                 # Create the table that will store the halo data.
                 line = "CREATE TABLE Halos (GlobalHaloID INTEGER PRIMARY KEY,\
                     SnapCurrentTimeIdentifier INTEGER, SnapZ FLOAT, SnapHaloID INTEGER, \
-                    DarkMatterMass FLOAT,\
+                    HaloMass FLOAT,\
                     NumPart INTEGER, CenMassX FLOAT, CenMassY FLOAT,\
                     CenMassZ FLOAT, BulkVelX FLOAT, BulkVelY FLOAT, BulkVelZ FLOAT,\
                     MaxRad FLOAT,\
@@ -284,15 +296,17 @@
         # Build the kdtree for the children by looping over the fetched rows.
         child_points = []
         for row in self.cursor:
-            p = Point()
-            p.data = [row[1], row[2], row[3]]
-            p.ID = row[0]
-            child_points.append(p)
-        child_kd = buildKdHyperRectTree(child_points[:],10)
-
-        # Make these just once.
-        neighbors = Neighbors()
-        neighbors.k = 5
+            child_points.append([row[1], row[2], row[3]])
+        # Turn it into fortran.
+        child_points = na.array(child_points)
+        fKD.pos = na.asfortranarray(child_points.T)
+        fKD.qv = na.empty(3, dtype='float64')
+        fKD.dist = na.empty(5, dtype='float64')
+        fKD.tags = na.empty(5, dtype='int64')
+        fKD.nn = 5
+        fKD.sort = True
+        fKD.rearrange = True
+        create_tree()
 
         # Find the parent points from the database.
         parent_pf = lagos.EnzoStaticOutput(parentfile)
@@ -305,13 +319,12 @@
         # parents.
         candidates = {}
         for row in self.cursor:
-            neighbors.points = []
-            neighbors.minDistanceSquared = 100. # should make this a function of the simulation
-            cm = [row[1], row[2], row[3]]
-            getKNN(cm, child_kd, neighbors, 0., [1.]*3)
+            fKD.qv = na.array([row[1], row[2], row[3]])
+            find_nn_nearest_neighbors()
+            NNtags = fKD.tags[:] - 1
             nIDs = []
-            for n in neighbors.points:
-                nIDs.append(n[1].ID)
+            for n in NNtags:
+                nIDs.append(n)
             if len(nIDs) < 5:
                 # We need to fill in fake halos if there aren't enough halos,
                 # which can happen at high redshifts.
@@ -319,13 +332,26 @@
                     nIDs.append(-1)
             candidates[row[0]] = nIDs
         
+        del fKD.pos, fKD.tags, fKD.dist
+        free_tree() # Frees the kdtree object.
+        
         self.candidates = candidates
+        
+        # This stores the masses contributed to each child candidate.
+        self.child_mass_arr = na.zeros(len(candidates)*5, dtype='float64')
+        # Records where to put the entries in the above array.
+        self.child_mass_loc = defaultdict(dict)
+        for i,halo in enumerate(sorted(candidates)):
+            for j, child in enumerate(candidates[halo]):
+                self.child_mass_loc[halo][child] = i*5 + j
 
     def _build_h5_refs(self, filename):
         # For this snapshot, add lists of file names that contain the
         # particle info for each halo.
         if not hasattr(self, 'h5files'):
             self.h5files = defaultdict(dict)
+        if not hasattr(self, 'names'):
+            self.names = defaultdict(set)
         file_pf = lagos.EnzoStaticOutput(filename)
         currt = file_pf['CurrentTimeIdentifier']
         dir = os.path.dirname(filename)
@@ -337,21 +363,10 @@
             line = line.strip().split()
             self.h5files[currt][i] = line[1:]
             names.update(line[1:])
+            self.names[currt].update(line[1:])
         lines.close()
-        # As long as we're here, open each of the h5 files and store the
-        # pointer in a persistent dict.
-        if not hasattr(self, 'h5fp'):
-            self.h5fp = defaultdict(dict)
-        for name in names:
-            self.h5fp[currt][name] = h5py.File(name)
 
-    def _close_h5fp(self):
-        # Cleanly close the open h5 file pointers.
-        for currt in self.h5fp:
-            for name in self.h5fp[currt]:
-                self.h5fp[currt][name].close()
-        
-    def _compute_child_fraction(self, parentfile, childfile):
+    def _compute_child_fraction(self, parentfile, childfile, last):
         # Given a parent and child snapshot, and a list of child candidates,
         # compute what fraction of the parent halo goes to each of the children.
         
@@ -363,90 +378,224 @@
         mylog.info("Computing fractional contribututions of particles to z=%1.5f halos." % \
             child_pf['CosmologyCurrentRedshift'])
         
-        child_percents = {}
-        for i,halo in enumerate(self.candidates):
-            if i%self.size != self.mine:
-                continue
-            # Read in its particle IDs
+        if last == None:
+            # First we're going to read in the particles, haloIDs and masses from
+            # the parent dataset.
+            parent_names = list(self.names[parent_currt])
+            parent_names.sort()
             parent_IDs = na.array([], dtype='int64')
             parent_masses = na.array([], dtype='float64')
-            for h5name in self.h5files[parent_currt][halo]:
-                # Get the correct time dict entry, and then the correct h5 file
-                # from that snapshot, and then choose this parent's halo
-                # group, and then the particle IDs. How's that for a long reach?
-                new_IDs = self.h5fp[parent_currt][h5name]['Halo%08d' % halo]['particle_index']
-                new_masses = self.h5fp[parent_currt][h5name]['Halo%08d' % halo]['ParticleMassMsun']
-                parent_IDs = na.concatenate((parent_IDs, new_IDs[:]))
-                parent_masses = na.concatenate((parent_masses, new_masses[:]))
-            # Loop over its children.
-            temp_percents = {}
-            for child in self.candidates[halo]:
-                if child == -1:
-                    # If this is a fake child, record zero contribution and move
-                    # on.
-                    temp_percents[-1] = 0.
+            parent_halos = na.array([], dtype='int32')
+            for i,pname in enumerate(parent_names):
+                if i>=self.mine and i%self.size==self.mine:
+                    h5fp = h5py.File(pname)
+                    for group in h5fp:
+                        gID = int(group[4:])
+                        thisIDs = h5fp[group]['particle_index'][:]
+                        thisMasses = h5fp[group]['ParticleMassMsun'][:]
+                        parent_IDs = na.concatenate((parent_IDs, thisIDs))
+                        parent_masses = na.concatenate((parent_masses, thisMasses))
+                        parent_halos = na.concatenate((parent_halos, 
+                            na.ones(thisIDs.size, dtype='int32') * gID))
+                    h5fp.close()
+            
+            # Sort the arrays by particle index in ascending order.
+            sort = parent_IDs.argsort()
+            parent_IDs = parent_IDs[sort]
+            parent_masses = parent_masses[sort]
+            parent_halos = parent_halos[sort]
+            del sort
+        else:
+            # We can use old data and save disk reading.
+            (parent_IDs, parent_masses, parent_halos) = last
+        # Used to communicate un-matched particles.
+        parent_send = na.ones(parent_IDs.size, dtype='bool')
+        
+        # Now get the child halo data.
+        child_names = list(self.names[child_currt])
+        child_names.sort()
+        child_IDs = na.array([], dtype='int64')
+        child_masses = na.array([], dtype='float64')
+        child_halos = na.array([], dtype='int32')
+        for i,cname in enumerate(child_names):
+            if i>=self.mine and i%self.size==self.mine:
+                h5fp = h5py.File(cname)
+                for group in h5fp:
+                    gID = int(group[4:])
+                    thisIDs = h5fp[group]['particle_index'][:]
+                    thisMasses = h5fp[group]['ParticleMassMsun'][:]
+                    child_IDs = na.concatenate((child_IDs, thisIDs))
+                    child_masses = na.concatenate((child_masses, thisMasses))
+                    child_halos = na.concatenate((child_halos, 
+                        na.ones(thisIDs.size, dtype='int32') * gID))
+                h5fp.close()
+        
+        # Sort the arrays by particle index.
+        sort = child_IDs.argsort()
+        child_IDs = child_IDs[sort]
+        child_masses = child_masses[sort]
+        child_halos = child_halos[sort]
+        child_send = na.ones(child_IDs.size, dtype='bool')
+        del sort
+        
+        # Parent IDs on the left, child IDs on the right. We skip down both
+        # columns matching IDs. If they are out of synch, the index(es) is/are
+        # advanced until they match up again.
+        left = 0
+        right = 0
+        while left < parent_IDs.size and right < child_IDs.size:
+            if parent_IDs[left] == child_IDs[right]:



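The diff is cut off by the 300-line truncation in the middle of the
particle-matching loop above. For illustration only (this is not the remainder
of the patch), here is a self-contained sketch of the technique the comment
describes: both particle-ID arrays are sorted, two indices walk down them in
lockstep, and whenever the IDs match, the parent particle's mass is credited to
that (parent halo, child halo) pair. The patch accumulates these masses into
child_mass_arr via child_mass_loc and combines them across processors; the
plain dictionary used here is a simplification.

    import numpy as na

    def match_sorted_ids(parent_IDs, parent_masses, parent_halos,
                         child_IDs, child_halos):
        # Walk the two sorted particle-ID arrays in lockstep, summing the
        # parent particle mass shared by each (parent halo, child halo) pair.
        # A simplified stand-in for the patch's child_mass_arr accumulation.
        contributions = {}
        left = right = 0
        while left < parent_IDs.size and right < child_IDs.size:
            if parent_IDs[left] == child_IDs[right]:
                key = (parent_halos[left], child_halos[right])
                contributions[key] = contributions.get(key, 0.) + parent_masses[left]
                left += 1
                right += 1
            elif parent_IDs[left] < child_IDs[right]:
                left += 1   # unmatched parent particle; advance the left column
            else:
                right += 1  # unmatched child particle; advance the right column
        return contributions

    # Made-up example: particle 11 moves from parent halo 0 to child halo 1,
    # particle 13 stays in halo 0, and particle 12 ends up in no child halo.
    parent_IDs = na.array([11, 12, 13], dtype='int64')
    parent_masses = na.array([1.0, 1.0, 2.0])
    parent_halos = na.array([0, 0, 0], dtype='int32')
    child_IDs = na.array([11, 13], dtype='int64')
    child_halos = na.array([1, 0], dtype='int32')
    print(match_sorted_ids(parent_IDs, parent_masses, parent_halos,
                           child_IDs, child_halos))
    # e.g. {(0, 1): 1.0, (0, 0): 2.0}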