cg

changeset 94:e460569c21d4
.
author: bshanks@bshanks-salk.dyndns.org
date: Tue Apr 21 17:35:00 2009 -0700 (16 years ago)
parents: 9f36acf8d9a8
children: a25a60a4bf43
files: grant.bib grant.html grant.odt grant.pdf grant.txt
--- a/grant.bib	Tue Apr 21 14:50:10 2009 -0700
+++ b/grant.bib	Tue Apr 21 17:35:00 2009 -0700
@@ -73,7 +73,7 @@
-	keywords = {atom,client-side wiki,interoperability,interwiki,middleware,webdav,wiki,wikiclient,wikigateway,wikirpcinterface,wiki xmlrpc},
+	keywords = {atom,client-side wiki,interoperability,interwiki,middleware,webdav,wiki,wiki xmlrpc,wikiclient,wikigateway,wikirpcinterface},
@@ -218,7 +218,7 @@
-	keywords = {atlas-based segmentation,automate robotic in situ hybridization image annotation,biological techniques,biological tissues,biology {computing,Brain,cell-cell} signaling,cell differentiation,cellular biophysics,cellular resolution,cluster analysis,data {mining,DNA} sequence database,functional genomics,gene expression pattern,genetics,image classification,image segmentation,mesh maps,pattern clustering,postnatal mouse brain,query interface,statistical analysis,tissue},
+	keywords = {atlas-based segmentation,automate robotic in situ hybridization image annotation,biological techniques,biological tissues,biology {computing,Brain,cell} differentiation,cell-cell signaling,cellular biophysics,cellular resolution,cluster analysis,data {mining,DNA} sequence database,functional genomics,gene expression pattern,genetics,image classification,image segmentation,mesh maps,pattern clustering,postnatal mouse brain,query interface,statistical analysis,tissue},
@@ -469,4 +469,36 @@
+},
+
+@article{paciorek_computational_2007,
+	title = {Computational techniques for spatial logistic regression with large data sets},
+	volume = {51},
+	issn = {0167-9473},
+	url = {http://www.sciencedirect.com/science/article/B6V8V-4MG6JWS-2/2/dfe5cd9c7ac7bc39d22ce45eebe303b8},
+	doi = {10.1016/j.csda.2006.11.008},
+	abstract = {In epidemiological research, outcomes are frequently non-normal, sample sizes may be large, and effect sizes are often small. To relate health outcomes to geographic risk factors, fast and powerful methods for fitting spatial models, particularly for non-normal data, are required. I focus on binary outcomes, with the risk surface a smooth function of space, but the development herein is relevant for non-normal data in general. I compare penalized likelihood {(PL)} models, including the penalized quasi-likelihood {(PQL)} approach, and Bayesian models based on fit, speed, and ease of implementation.
+A Bayesian model using a spectral basis {(SB)} representation of the spatial surface via the Fourier basis provides the best tradeoff of sensitivity and specificity in simulations, detecting real spatial features while limiting overfitting and being reasonably computationally efficient. One of the contributions of this work is further development of this underused representation. The {SB} model outperforms the {PL} methods, which are prone to overfitting, but is slower to fit and not as easily implemented. A Bayesian Markov random field model performs less well statistically than the {SB} model, but is very computationally efficient. We illustrate the methods on a real data set of cancer cases in Taiwan.
+The success of the {SB} with binary data and similar results with count data suggest that it may be generally useful in spatial models and more complicated hierarchical models.},
+	number = {8},
+	journal = {Computational Statistics \& Data Analysis},
+	author = {Christopher J. Paciorek},
+	month = may,
+	year = {2007},
+	keywords = {Bayesian statistics,Disease mapping,Fourier basis,Generalized linear mixed model,Geostatistics,Risk surface,Spatial statistics,Spectral basis},
+	pages = {3631--3653}
+},
+
+@article{hastie_gene_2000,
+	title = {{'Gene} shaving' as a method for identifying distinct sets of genes with similar expression patterns},
+	volume = {1},
+	issn = {1465-6906},
+	url = {http://genomebiology.com/2000/1/2/research/0003/},
+	doi = {10.1186/gb-2000-1-2-research0003},
+	abstract = {{BACKGROUND}: Large gene expression studies, such as those conducted using {DNA} arrays, often provide millions of different pieces of data. To address the problem of analyzing such data, we describe a statistical method, which we have called 'gene shaving'. The method identifies subsets of genes with coherent expression patterns and large variation across conditions. Gene shaving differs from hierarchical clustering and other widely used methods for analyzing gene expression studies in that genes may belong to more than one cluster, and the clustering may be supervised by an outcome measure. The technique can be 'unsupervised', that is, the genes and samples are treated as unlabeled, or partially or fully supervised by using known properties of the genes or samples to assist in finding meaningful groupings. {RESULTS}: We illustrate the use of the gene shaving method to analyze gene expression measurements made on samples from patients with diffuse large B-cell lymphoma. The method identifies a small cluster of genes whose expression is highly predictive of survival. {CONCLUSIONS}: The gene shaving method is a potentially useful tool for exploration of gene expression data and identification of interesting clusters of genes worth further investigation.},
+	number = {2},
+	journal = {Genome Biology},
+	author = {Trevor Hastie and Robert Tibshirani and Michael Eisen and Ash Alizadeh and Ronald Levy and Louis Staudt and Wing Chan and David Botstein and Patrick Brown},
+	year = {2000},
+	pages = {research0003.1--research0003.21}
--- a/grant.html	Tue Apr 21 14:50:10 2009 -0700
+++ b/grant.html	Tue Apr 21 17:35:00 2009 -0700
@@ -13,7 +13,8 @@
-space.
+space. In particular, our method could be applied to genome-wide sequencing data derived from sets of tissues and disease
+states.
@@ -29,21 +30,21 @@
-Now we will discuss each of our three aims in turn.  For each aim, we will develop a conceptual framework for thinking
-about the task, and we will present our strategy for solving it. Next we will discuss related work. At the conclusion of each
-section, we will summarize why our strategy is different from what has been done before. At the end of this section, we will
-describe the potential impact.
+Each of our three aims will be discussed in turn. For each aim, we will develop a conceptual framework for thinking about
+the task, and we will present our strategy for solving it. Next we will discuss related work. At the conclusion of each section,
+we will summarize why our strategy is different from what has been done before. At the end of this section, we will describe
+the potential impact.
-Machine learning terminology The task of looking for marker genes for known anatomical regions means that one is
-looking for a set of genes such that, if the expression level of those genes is known, then the locations of the regions can be
-inferred.
-If we define the regions so that they cover the entire anatomical structure to be divided, we may say that we are using
-gene expression to determine to which region each voxel within the structure belongs.  We call this a classification task,
-because each voxel is being assigned to a class (namely, its region).  An understanding of the relationship between the
-combination of their expression levels and the locations of the regions may be expressed as a function.  The input to this
-function is a voxel, along with the gene expression levels within that voxel; the output is the regional identity of the target
-voxel, that is, the region to which the target voxel belongs.  We call this function a classifier.  In general, the input to a
-classifier is called an instance, and the output is called a label (or a class label).
+Machine learning terminology:  classifiers The task of looking for marker genes for known anatomical regions means
+that one is looking for a set of genes such that, if the expression level of those genes is known, then the locations of the
+regions can be inferred.
+If we define the regions so that they cover the entire anatomical structure to be subdivided, we may say that we are
+using gene expression in each voxel to assign that voxel to the proper area. We call this a classification task, because each
+voxel is being assigned to a class (namely, its region).  An understanding of the relationship between the combination of
+their expression levels and the locations of the regions may be expressed as a function. The input to this function is a voxel,
+along with the gene expression levels within that voxel; the output is the regional identity of the target voxel, that is, the
+region to which the target voxel belongs. We call this function a classifier. In general, the input to a classifier is called an
+instance, and the output is called a label (or a class label).
@@ -63,17 +64,23 @@
+Both gene expression data and anatomical atlases have errors, due to a variety of factors.  Individual subjects have
+idiosyncratic anatomy.  Subjects may be improperly registered to the atlas.  The method used to measure gene expression
+may be noisy.  The atlas may have errors.  It is even possible that some areas in the anatomical atlas are &#8220;wrong&#8221; in that
+they do not have the same shape as the natural domains of gene expression to which they correspond. These sources of error
+can affect the displacement and the shape of both the gene expression data and the anatomical target areas. Therefore, it
+is important to use feature selection methods which are robust to these kinds of errors.
+_________________________________________
+   1Strictly speaking, the features are gene expression levels, but we&#8217;ll call them genes.
-It is too much to hope that every anatomical region of interest will be identified by a single gene.  For example, in the
+It is too much to hope that every anatomical region of interest will be identified by a single gene.  For example, in the
-_______
-   1Strictly speaking, the features are gene expression levels, but we&#8217;ll call them genes.
@@ -107,35 +114,34 @@
-[11 ] mentions the possibility of constructing a spatial region for each gene, and then, for each anatomical structure of
+[12 ] mentions the possibility of constructing a spatial region for each gene, and then, for each anatomical structure of
-GeneAtlas[5] and EMAGE [23] allow the user to construct a search query by demarcating regions and then specifing
+GeneAtlas[5] and EMAGE [25] allow the user to construct a search query by demarcating regions and then specifying
-[13 ] describes AGEA, &#8221;Anatomic Gene Expression Atlas&#8221;.  AGEA has three components.  Gene Finder:  The user
+[14 ] describes AGEA, &#8220;Anatomic Gene Expression Atlas&#8221;.  AGEA has three components.  Gene Finder:  The user
+_________________________________________
+   2By &#8220;fundamentally spatial&#8221; we mean that there is information from a large number of spatial locations indexed by spatial coordinates; not
+just data which have only a few different locations or which is indexed by anatomical label.
+    3Actually, many of these projects use quadrilaterals instead of square pixels; but we will refer to them as pixels for simplicity.
+    4the number of true pixels in the intersection of the two images, divided by the number of pixels in their union.
-_________________________________________
-   2By &#8220;fundamentally spatial&#8221; we mean that there is information from a large number of spatial locations indexed by spatial coordinates; not
-just data which have only a few different locations or which is indexed by anatomical label.
-    3Actually, many of these projects use quadrilaterals instead of square pixels; but we will refer to them as pixels for simplicity.
-    4the number of true pixels in the intersection of the two images, divided by the number of pixels in their union.
-    5&#8220;Expression energy ratio&#8221;, which captures overexpression.
-[9 ] describes a technique to find combinations of marker genes to pick out an anatomical region. They use an evolutionary
+[10 ] describes a technique to find combinations of marker genes to pick out an anatomical region. They use an evolutionary
@@ -176,17 +182,18 @@
+_________________________________________
+   5&#8220;Expression energy ratio&#8221;, which captures overexpression.
+    6There are imaging tasks which use more than three colors, for example multispectral imaging and hyperspectral imaging, which are often
+used to process satellite imagery.
+    7First, because the number of features in the reduced dataset is less than in the original dataset, the running time of clustering algorithms
+may be much less. Second, it is thought that some clustering algorithms may give better results on reduced data.
-__
-   6There are imaging tasks which use more than three colors, for example multispectral imaging and hyperspectral imaging, which are often
-used to process satellite imagery.
-    7First, because the number of features in the reduced dataset is less than in the original dataset, the running time of clustering algorithms
-may be much less. Second, it is thought that some clustering algorithms may give better results on reduced data.
-patternwhich seems to pick out a single, spatially continguous region.  Therefore, it seems likely that an anatomically
+pattern which seems to pick out a single, spatially contiguous region.  Therefore, it seems likely that an anatomically
@@ -194,20 +201,20 @@
-Some researchers have attempted to parcellate cortex on the basis of non-gene expression data. For example, [15], [2], [16],
-and [1 ] associate spots on the cortex with the radial profile9 of response to some stain ([10] uses MRI), extract features from
+Some researchers have attempted to parcellate cortex on the basis of non-gene expression data. For example, [17], [2], [18],
+and [1 ] associate spots on the cortex with the radial profile9 of response to some stain ([11] uses MRI), extract features from
-[20 ] describes an analysis of the anatomy of the hippocampus using the ABA dataset.  In addition to manual analysis,
+[22 ] describes an analysis of the anatomy of the hippocampus using the ABA dataset.  In addition to manual analysis,
-AGEA[13] includes a preset hierarchial clustering of voxels based on a recursive bifurcation algorithm with correlation
-as the similarity metric.  EMAGE[23] allows the user to select a dataset from among a large number of alternatives, or by
+AGEA[14] includes a preset hierarchical clustering of voxels based on a recursive bifurcation algorithm with correlation
+as the similarity metric.  EMAGE[25] allows the user to select a dataset from among a large number of alternatives, or by
@@ -217,10 +224,10 @@
-[9 ] applies their technique for finding combinations of marker genes for the purpose of clustering genes around a &#8220;seed
+[10 ] applies their technique for finding combinations of marker genes for the purpose of clustering genes around a &#8220;seed
-seed. The same team also describes a method[22] for finding &#8220;association rules&#8221; such as, &#8220;if this voxel is expressed in by
+seed. The same team also describes a method[24] for finding &#8220;association rules&#8221; such as, &#8220;if this voxel is expressed in by
@@ -228,10 +235,6 @@
-Aim 3: apply the methods developed to the cerebral cortex
-Background
-The cortex is divided into areas and layers. Because of the cortical columnar organization, the parcellation of the cortex
-into areas can be drawn as a 2-D map on the surface of the cortex.  In the third dimension, the boundaries between the
@@ -241,16 +244,20 @@
+Aim 3: apply the methods developed to the cerebral cortex
+Background
+The cortex is divided into areas and layers. Because of the cortical columnar organization, the parcellation of the cortex
+into areas can be drawn as a 2-D map on the surface of the cortex.  In the third dimension, the boundaries between the
-surface.One can picture an area of the cortex as a slice of a six-layered cake11.
+surface. One can picture an area of the cortex as a slice of a six-layered cake11.
-agreed-upon map can be seen by contrasting the recent maps given by Swanson[19] on the one hand, and Paxinos and
-Franklin[14] on the other.  While the maps are certainly very similar in their general arrangement, significant differences
+agreed-upon map can be seen by contrasting the recent maps given by Swanson[21] on the one hand, and Paxinos and
+Franklin[16] on the other.  While the maps are certainly very similar in their general arrangement, significant differences
@@ -260,8 +267,8 @@
-voxels in the 3D coordinate system, of which 51,533 are in the brain[13].
-Mus musculus is thought to contain about 22,000 protein-coding genes[25].  The ABA contains data on about 20,000
+voxels in the 3D coordinate system, of which 51,533 are in the brain[14].
+Mus musculus is thought to contain about 22,000 protein-coding genes[27].  The ABA contains data on about 20,000
@@ -269,7 +276,7 @@
-[13 ] describes the application of AGEA to the cortex. The paper describes interesting results on the structure of correlations
+[14 ] describes the application of AGEA to the cortex. The paper describes interesting results on the structure of correlations
@@ -278,20 +285,17 @@
-Our project is guided by a concrete application with a well-specified criterion of success (how well we can find marker
-genes for / reproduce the layout of cortical areas), which will provide a solid basis for comparing different methods.
-Significance
-________________________ 
-11Outside of isocortex, the number of layers varies.
-   12The sagittal data do not cover the entire cortex, and also have greater registration error[13].  Genes were selected by the Allen Institute for
+___________________
+  11Outside of isocortex, the number of layers varies.
+   12The sagittal data do not cover the entire cortex, and also have greater registration error[14].  Genes were selected by the Allen Institute for
-pattern&#8221;[13].
-   13Other such resources include GENSAT[8], GenePaint[24], its sister project GeneAtlas[5], BGEM[12], EMAGE[23], EurExpress (http://www.
-eurexpress.org/ee/; EurExpress data are also entered into EMAGE), EADHB (http://www.ncl.ac.uk/ihg/EADHB/database/EADHB_database.
-html),  MAMEP  (http://mamep.molgen.mpg.de/index.php),  Xenbase  (http://xenbase.org/),  ZFIN[18],  Aniseed  (http://aniseed-ibdm.
-univ-mrs.fr/), VisiGene (http://genome.ucsc.edu/cgi-bin/hgVisiGene ; includes data from some of the other listed data sources), GEISHA[4],
-Fruitfly.org[21], COMPARE (http://compare.ibdml.univ-mrs.fr/), GXD[17], GEO[3] (GXD and GEO contain spatial data but also non-spatial
-data. All GXD spatial data are also in EMAGE.)
+pattern&#8221;[14].
+   13Other  such  resources  include  GENSAT[8],  GenePaint[26],  its  sister  project  GeneAtlas[5],  BGEM[13],  EMAGE[25],  EurExpress  (http:
+//www.eurexpress.org/ee/;  EurExpress data are also entered into EMAGE), EADHB (http://www.ncl.ac.uk/ihg/EADHB/database/EADHB_
+database.html),   MAMEP  (http://mamep.molgen.mpg.de/index.php),   Xenbase  (http://xenbase.org/),   ZFIN[20],   Aniseed  (http://
+aniseed-ibdm.univ-mrs.fr/),  VisiGene  (http://genome.ucsc.edu/cgi-bin/hgVisiGene ;  includes  data  from  some  of  the  other  listed  data
+sources), GEISHA[4], Fruitfly.org[23], COMPARE (http://compare.ibdml.univ-mrs.fr/), GXD[19], GEO[3] (GXD and GEO contain spatial
+data but also non-spatial data. All GXD spatial data are also in EMAGE.)
@@ -299,7 +303,9 @@
-
+Our project is guided by a concrete application with a well-specified criterion of success (how well we can find marker
+genes for / reproduce the layout of cortical areas), which will provide a solid basis for comparing different methods.
+Significance
@@ -338,15 +344,15 @@
-                               modifications to the human cortical map as well.  In fact, the methods we
-                               will develop will be applicable to other datasets beyond the brain.  We will
-                               provide an open-source toolbox to allow other researchers to easily use our
-                               methods.  With these methods, researchers with gene expression for any area
-                               of the body will be able to efficiently find marker genes for anatomical regions,
-or to use gene expression to discover new anatomical patterning. As described above, marker genes have a variety of uses in
-the development of drugs and experimental manipulations, and in the anatomical characterization of tissue samples.  The
-discovery of new ways to carve up anatomical structures into regions may lead to the discovery of new anatomical subregions
-in various structures, which will widely impact all areas of biology.
+                               modifications to the human cortical map as well. In fact, the methods we will
+                               develop will be applicable to other datasets beyond the brain. We will provide
+                               an open-source toolbox to allow other researchers to easily use our methods.
+                               With these methods, researchers with gene expression for any area of the body
+                               will be able to efficiently find marker genes for anatomical regions, or to use
+                               gene expression to discover new anatomical patterning.  As described above,
+marker genes have a variety of uses in the development of drugs and experimental manipulations, and in the anatomical
+characterization of tissue samples.  The discovery of new ways to carve up anatomical structures into regions may lead to
+the discovery of new anatomical subregions in various structures, which will widely impact all areas of biology.
@@ -362,20 +368,12 @@
-Using Caret[7], we created a mesh representation of the surface of the selected voxels.  For each gene, for each node of
-the mesh, we calculated an average of the gene expression of the voxels &#8220;underneath&#8221; that mesh node.  We then flattened
+Using Caret[7], we created a mesh representation of the surface of the selected voxels. For each gene, and for each node
+of the mesh, we calculated an average of the gene expression of the voxels &#8220;underneath&#8221; that mesh node. We then flattened
-We sampled the nodes of the irregular, flat mesh in order to create a regular grid of pixel values. We converted this grid
-into a MATLAB matrix.
-We manually traced the boundaries of each of 49 cortical areas from the ABA coronal reference atlas slides.  We then
-converted these manual traces into Caret-format regional boundary data on the mesh surface.  We projected the regions
-onto the 2-d mesh, and then onto the grid, and then we converted the region data into MATLAB format.
-_________________________________________
+____
-At this point, the data are in the form of a number of 2-D matrices, all in registration, with the matrix entries representing
-a grid of points (pixels) over the cortical surface:
-&#x2219;A 2-D matrix whose entries represent the regional label associated with each surface pixel
-&#x2219;For each gene, a 2-D matrix whose entries represent the average expression level underneath each surface pixel
+
@@ -385,7 +383,21 @@
-Ptk7, and Aph1a.                          We created a normalized version of the gene expression data by subtracting
+Ptk7, and Aph1a.                          We sampled the nodes of the irregular, flat mesh in order to create a regular
+                               grid of pixel values. We converted this grid into a MATLAB matrix.
+                                  We manually traced the boundaries of each of 49 cortical areas from the
+                               ABA coronal reference atlas slides.  We then converted these manual traces
+                               into Caret-format regional boundary data on the mesh surface. We projected
+                               the regions onto the 2-d mesh, and then onto the grid, and then we converted
+                               the region data into MATLAB format.
+                                  At this point, the data are in the form of a number of 2-D matrices, all in
+                               registration, with the matrix entries representing a grid of points (pixels) over
+                               the cortical surface:
+                                  &#x2219; A 2-D matrix whose entries represent the regional label associated with
+                                    each surface pixel
+                                  &#x2219; For each gene, a 2-D matrix whose entries represent the average expres-
+                                    sion level underneath each surface pixel
+                                  We created a normalized version of the gene expression data by subtracting
@@ -393,31 +405,30 @@
-                               plan to create a separate matrix for each cortical layer to represent the average
-                               expression level within that layer. Cortical layers are found at different depths
-                               in different parts of the cortex. In preparation for extracting the layer-specific
-                               datasets, we have extended Caret with routines that allow the depth of the
-                               ROI for volume-to-surface projection to vary.
-                                  In the Research Plan, we describe how we will automatically locate the
-                               layer depths.  For validation, we have manually demarcated the depth of the
-                               outer boundary of cortical layer 5 throughout the cortex.
-                                Feature selection and scoring methods
-                               Underexpression of a gene can serve as a marker Underexpression of a
-                               gene can sometimes serve as a marker. See, for example, Figure 2.
-                                  Correlation Recall that the instances are surface pixels, and consider the
-                               problem of attempting to classify each instance as either a member of a partic-
-                               ular anatomical area, or not. The target area can be represented as a boolean
-                               mask over the surface pixels.
-One class of feature selection scoring methods contains methods which calculate some sort of &#8220;match&#8221; between each gene
-image and the target image. Those genes which match the best are good candidates for features.
-One of the simplest methods in this class is to use correlation as the match score. We calculated the correlation between
-each gene and each cortical area. The top row of Figure 1 shows the three genes most correlated with area SS.
+plan to create a separate matrix for each cortical layer to represent the average expression level within that layer. Cortical
+layers are found at different depths in different parts of the cortex. In preparation for extracting the layer-specific datasets,
+we have extended Caret with routines that allow the depth of the ROI for volume-to-surface projection to vary.
+In the Research Plan, we describe how we will automatically locate the layer depths. For validation, we have manually
+demarcated the depth of the outer boundary of cortical layer 5 throughout the cortex.
+Feature selection and scoring methods
+Underexpression of a gene can serve as a marker Underexpression of a gene can sometimes serve as a marker. See,
+for example, Figure 2.
-the corresponding pixels in the upper row).     Conditional entropy An information-theoretic scoring method is to find
+the corresponding pixels in the upper row).     Correlation Recall that the instances are surface pixels, and consider the
+                               problem of attempting to classify each instance as either a member of a partic-
+                               ular anatomical area, or not. The target area can be represented as a boolean
+                               mask over the surface pixels.
+                                  One class of feature selection scoring methods contains methods which cal-
+                               culate some sort of &#8220;match&#8221; between each gene image and the target image.
+                               Those genes which match the best are good candidates for features.
+                                  One of the simplest methods in this class is to use correlation as the match
+                               score. We calculated the correlation between each gene and each cortical area.
+                               The top row of Figure 1 shows the three genes most correlated with area SS.
+                                  Conditional entropy An information-theoretic scoring method is to find
@@ -425,20 +436,16 @@
-                               boolean masks of the gene data. For each gene, we created a boolean mask of
-                               its expression levels using each of these thresholds: the mean of that gene, the
-                               mean minus one standard deviation, the mean minus two standard deviations,
-                               the mean plus one standard deviation, the mean plus two standard deviations.
-                                  Now, for each region, we created and ran a forward stepwise procedure
-                               which attempted to find pairs of gene expression boolean masks such that the
-                               conditional entropy of the target area&#8217;s boolean mask, conditioned upon the
-                               pair of gene expression boolean masks, is minimized.
-                                  This finds pairs of genes which are most informative (at least at these dis-
-                               cretization thresholds) relative to the question, &#8220;Is this surface pixel a member
-                               of the target area?&#8221;. Its advantage over linear methods such as logistic regres-
-sion is that it takes account of arbitrarily nonlinear relationships; for example, if the XOR of two variables predicts the
-target, conditional entropy would notice, whereas linear methods would not.
-
+                               boolean masks of the gene data.  For each gene, we created a boolean mask
+of its expression levels using each of these thresholds: the mean of that gene, the mean minus one standard deviation, the
+mean minus two standard deviations, the mean plus one standard deviation, the mean plus two standard deviations.
+Now, for each region, we created and ran a forward stepwise procedure which attempted to find pairs of gene expression
+boolean masks such that the conditional entropy of the target area&#8217;s boolean mask, conditioned upon the pair of gene
+expression boolean masks, is minimized.
+This finds pairs of genes which are most informative (at least at these discretization thresholds) relative to the question,
+&#8220;Is this surface pixel a member of the target area?&#8221;. Its advantage over linear methods such as logistic regression is that it
+takes account of arbitrarily nonlinear relationships; for example, if the XOR of two variables predicts the target, conditional
+entropy would notice, whereas linear methods would not.
@@ -508,20 +515,20 @@
-larity, we have already found single genes which roughly identify some areas and groupings of areas. For each of these areas,
-an example of a gene which roughly identifies it is shown in Figure 5.  We have not yet cross-verified these genes in other
-atlases.
-In addition, there are a number of areas which are almost identified by single genes:  COAa+NLOT (anterior part of
-cortical amygdalar area, nucleus of the lateral olfactory tract), ENT (entorhinal), ACAv (ventral anterior cingulate), VIS
-(visual), AUD (auditory).
-These results validate our expectation that the ABA dataset can be exploited to find marker genes for many cortical
-areas, while also validating the relevancy of our new scoring method, gradient similarity.
+                               larity, we have already found single genes which roughly identify some areas
+and groupings of areas. For each of these areas, an example of a gene which roughly identifies it is shown in Figure 5. We
+have not yet cross-verified these genes in other atlases.
+In addition, there are a number of areas which are almost identified by single genes:  COAa+NLOT (anterior part of
+cortical amygdalar area, nucleus of the lateral olfactory tract), ENT (entorhinal), ACAv (ventral anterior cingulate), VIS
+(visual), AUD (auditory).
+These results validate our expectation that the ABA dataset can be exploited to find marker genes for many cortical
+areas, while also validating the relevancy of our new scoring method, gradient similarity.
@@ -533,10 +540,10 @@
-Feature selection integrated with prediction As noted earlier, in general, any predictive method can be used for
-feature selection by running it inside a stepwise wrapper. Also, some predictive methods integrate soft constraints on number
-of features used. Examples of both of these will be seen in the section &#8220;Multivariate Predictive methods&#8221;.
-Multivariate Predictive methods
+Feature selection integrated with prediction As noted earlier, in general, any classifier can be used for feature
+selection by running it inside a stepwise wrapper.  Also, some learning algorithms integrate soft constraints on the number of
+features used. Examples of both of these will be seen in the section &#8220;Multivariate supervised learning&#8221;.
+Multivariate supervised learning
@@ -578,6 +585,8 @@
+_________________________________________
+  195-fold cross-validation.
@@ -585,8 +594,6 @@
-_________________________________________
-  195-fold cross-validation.
@@ -623,10 +630,12 @@
-as Student&#8217;s t-test, and the Mann-Whitney U test (a non-parametric test). In addition, any predictive procedure induces a
-scoring measure on genes by taking the prediction error when using that gene to predict the target.
+as Student&#8217;s t-test, and the Mann-Whitney U test (a non-parametric test).  In addition, any classifier induces a scoring
+measure on genes by taking the prediction error when using that gene to predict the target.
-for each cortical area, we will rank the genes by their ability to delineate each area.
+for each cortical area, we will rank the genes by their ability to delineate each area. We will quantitatively compare the list
+of single genes generated by our method to the lists generated by previous methods which are mentioned in Aim 1 Related
+Work.
@@ -635,33 +644,35 @@
-wrapper over &#8220;vanilla&#8221; predictive methods such as logistic regression, (b) predictive methods such as decision trees which
-incrementally/greedily combine single gene markers into sets, and (c) predictive methods which use soft constraints to
-minimize number of features used, such as sparse support vector machines.
-todo
-Some of these methods, such as the Hough transform, are designed to be resistant to registration error and error in the
-anatomical map.
-We will also consider extensions to scoring measures that may improve their robustness to registration error and to
-error in the anatomical map; for example, a wrapper that runs a scoring method on small displacements and distortions
-of the data adds robustness to registration error at the expense of computation time.  It is possible that some areas in the
-anatomical map do not correspond to natural domains of gene expression.
-# Extend the procedure to handle difficult areas by combining or redrawing the boundaries: An area may be difficult to
-identify because the boundaries are misdrawn, or because it does not &#8220;really&#8221; exist as a single area, at least on the genetic
-level. We will develop extensions to our procedure which (a) detect when a difficult area could be fit if its boundary were
-redrawn slightly, and (b) detect when a difficult area could be combined with adjacent areas to create a larger area which
-can be fit.
+wrapper over &#8220;vanilla&#8221; classifiers such as logistic regression, (b) supervised learning methods such as decision trees which
+incrementally/greedily combine single gene markers into sets, and (c) supervised learning methods which use soft constraints
+to minimize the number of features used, such as sparse support vector machines.
+Since errors of displacement and of shape may cause genes and target areas to match less than they should, we will
+consider the robustness of feature selection methods in the presence of error.  Some of these methods, such as the Hough
+transform, are designed to be resistant in the presence of error, but many are not.  We will consider extensions to scoring
+measures that may improve their robustness; for example, a wrapper that runs a scoring method on small displacements
+and distortions of the data adds robustness to registration error at the expense of computation time.
+An area may be difficult to identify because the boundaries are misdrawn in the atlas, or because the shape of the natural
+domain of gene expression corresponding to the area is different from the shape of the area as recognized by anatomists.
+We will extend our procedure to handle difficult areas by combining areas or redrawing their boundaries.  We will develop
+extensions to our procedure which (a) detect when a difficult area could be fit if its boundary were redrawn slightly, and (b)
+detect when a difficult area could be combined with adjacent areas to create a larger area which can be fit.
-Decision trees todo
-20.
+Classifiers
+We will explore and compare different classifiers.  As noted above, this activity is not separate from the previous one,
+because some supervised learning algorithms include feature selection, and any classifier can be combined with a stepwise
+wrapper for use as a feature selection method.  We will explore logistic regression (including spatial models[15]), decision
+trees20 , sparse SVMs, generative mixture models (including naive Bayes), kernel density estimation, genetic algorithms, and
+artificial neural networks.
+Decision trees
-# mixture models, etc
-4.Explore clustering algorithms applied to genes: including gene shaving, TODO
+4.Explore clustering algorithms applied to genes: including gene shaving[9], TODO
@@ -677,6 +688,10 @@
+_________________________________________
+  20Actually, we have already begun to explore decision trees. For each cortical area, we have used the C4.5 algorithm to find a decision tree for
+that area.  We achieved good classification accuracy on our training set, but the number of genes that appeared in each tree was too large.  We
+plan to implement a pruning procedure to generate trees that use fewer genes.
@@ -686,10 +701,6 @@
-_________________________________________
-  20Already, for each cortical area, we have used the C4.5 algorithm to find a decision tree for that area. We achieved good classification accuracy
-on our training set, but the number of genes that appeared in each tree was too large.  We plan to implement a pruning procedure to generate
-trees that use fewer genes
@@ -736,52 +747,57 @@
-[9]Jano Hemert and Richard Baldock. Matching Spatial Regions with Combinations of Interacting Gene Expression Pat-
+[9]Trevor Hastie, Robert Tibshirani, Michael Eisen, Ash Alizadeh, Ronald Levy, Louis Staudt, Wing Chan, David Botstein,
+and Patrick Brown. &#8216;Gene shaving&#8217; as a method for identifying distinct sets of genes with similar expression patterns.
+Genome Biology, 1(2):research0003.1&#8211;research0003.21, 2000.
+[10]Jano Hemert and Richard Baldock. Matching Spatial Regions with Combinations of Interacting Gene Expression Pat-
-[10]F. Kruggel, M. K. Br&#252;ckner, Th. Arendt, C. J. Wiggins, and D. Y. von Cramon. Analyzing the neocortical fine-structure.
+[11]F. Kruggel, M. K. Br&#252;ckner, Th. Arendt, C. J. Wiggins, and D. Y. von Cramon. Analyzing the neocortical fine-structure.
-[11]Erh-Fang Lee, Jyl Boline, and Arthur W. Toga. A High-Resolution anatomical framework of the neonatal mouse brain
+[12]Erh-Fang Lee, Jyl Boline, and Arthur W. Toga. A High-Resolution anatomical framework of the neonatal mouse brain
-[12]Susan Magdaleno, Patricia Jensen, Craig L. Brumwell, Anna Seal, Karen Lehman, Andrew Asbury, Tony Cheung,
+[13]Susan Magdaleno, Patricia Jensen, Craig L. Brumwell, Anna Seal, Karen Lehman, Andrew Asbury, Tony Cheung,
-[13]Lydia Ng, Amy Bernard, Chris Lau, Caroline C Overly, Hong-Wei Dong, Chihchau Kuan, Sayan Pathak, Susan M
+[14]Lydia Ng, Amy Bernard, Chris Lau, Caroline C Overly, Hong-Wei Dong, Chihchau Kuan, Sayan Pathak, Susan M
-[14]George Paxinos and Keith B.J. Franklin. The Mouse Brain in Stereotaxic Coordinates. Academic Press, 2 edition, July
+[15]Christopher J. Paciorek.  Computational techniques for spatial logistic regression with large data sets.  Computational
+Statistics &amp; Data Analysis, 51(8):3631&#8211;3653, May 2007.
+[16]George Paxinos and Keith B.J. Franklin. The Mouse Brain in Stereotaxic Coordinates. Academic Press, 2 edition, July
-[15]A. Schleicher, N. Palomero-Gallagher, P. Morosan, S. Eickhoff, T. Kowalski, K. Vos, K. Amunts, and K. Zilles. Quanti-
+[17]A. Schleicher, N. Palomero-Gallagher, P. Morosan, S. Eickhoff, T. Kowalski, K. Vos, K. Amunts, and K. Zilles. Quanti-
-[16]Oliver Schmitt, Lars H&#246;mke, and Lutz D&#252;mbgen.  Detection of cortical transition regions utilizing statistical analyses of
+[18]Oliver Schmitt, Lars H&#246;mke, and Lutz D&#252;mbgen.  Detection of cortical transition regions utilizing statistical analyses of
-[17]Constance M. Smith, Jacqueline H. Finger, Terry F. Hayamizu, Ingeborg J. McCright, Janan T. Eppig, James A.
+[19]Constance M. Smith, Jacqueline H. Finger, Terry F. Hayamizu, Ingeborg J. McCright, Janan T. Eppig, James A.
-[18]Judy Sprague, Leyla Bayraktaroglu, Dave Clements, Tom Conlin, David Fashena, Ken Frazer, Melissa Haendel, Dou-
+[20]Judy Sprague, Leyla Bayraktaroglu, Dave Clements, Tom Conlin, David Fashena, Ken Frazer, Melissa Haendel, Dou-
-[19]Larry Swanson. Brain Maps: Structure of the Rat Brain. Academic Press, 3 edition, November 2003.
-[20]Carol L. Thompson, Sayan D. Pathak, Andreas Jeromin, Lydia L. Ng, Cameron R. MacPherson, Marty T. Mortrud,
+[21]Larry Swanson. Brain Maps: Structure of the Rat Brain. Academic Press, 3 edition, November 2003.
+[22]Carol L. Thompson, Sayan D. Pathak, Andreas Jeromin, Lydia L. Ng, Cameron R. MacPherson, Marty T. Mortrud,
-[21]Pavel  Tomancak,  Amy  Beaton,  Richard  Weiszmann,  Elaine  Kwan,  ShengQiang  Shu,  Suzanna  E  Lewis,  Stephen
+[23]Pavel  Tomancak,  Amy  Beaton,  Richard  Weiszmann,  Elaine  Kwan,  ShengQiang  Shu,  Suzanna  E  Lewis,  Stephen
-[22]Jano van Hemert and Richard Baldock. Mining Spatial Gene Expression Data for Association Rules, volume 4414/2007
+[24]Jano van Hemert and Richard Baldock. Mining Spatial Gene Expression Data for Association Rules, volume 4414/2007
-[23]Shanmugasundaram Venkataraman, Peter Stevenson, Yiya Yang, Lorna Richardson, Nicholas Burton, Thomas P. Perry,
+[25]Shanmugasundaram Venkataraman, Peter Stevenson, Yiya Yang, Lorna Richardson, Nicholas Burton, Thomas P. Perry,
-[24]Axel Visel, Christina Thaller, and Gregor Eichele.  GenePaint.org:  an atlas of gene expression patterns in the mouse
+[26]Axel Visel, Christina Thaller, and Gregor Eichele.  GenePaint.org:  an atlas of gene expression patterns in the mouse
-[25]Robert H Waterston, Kerstin Lindblad-Toh, Ewan Birney, Jane Rogers, Josep F Abril, Pankaj Agarwal, Richa Agar-
+[27]Robert H Waterston, Kerstin Lindblad-Toh, Ewan Birney, Jane Rogers, Josep F Abril, Pankaj Agarwal, Richa Agar-
--- a/grant.txt	Tue Apr 21 14:50:10 2009 -0700
+++ b/grant.txt	Tue Apr 21 17:35:00 2009 -0700
@@ -14,7 +14,7 @@
-Although our particular application involves the 3D spatial distribution of gene expression, we anticipate that the methods developed in aims (1) and (2) will generalize to any sort of high-dimensional data over points located in a low-dimensional space.
+Although our particular application involves the 3D spatial distribution of gene expression, we anticipate that the methods developed in aims (1) and (2) will generalize to any sort of high-dimensional data over points located in a low-dimensional space. In particular, our method could be applied to genome-wide sequencing data derived from sets of tissues and disease states.
@@ -29,11 +29,11 @@
-Now we will discuss each of our three aims in turn. For each aim, we will develop a conceptual framework for thinking about the task, and we will present our strategy for solving it. Next we will discuss related work. At the conclusion of each section, we will summarize why our strategy is different from what has been done before. At the end of this section, we will describe the potential impact.
+Each of our three aims will be discussed in turn. For each aim, we will develop a conceptual framework for thinking about the task, and we will present our strategy for solving it. Next we will discuss related work. At the conclusion of each section, we will summarize why our strategy is different from what has been done before. At the end of this section, we will describe the potential impact.
-\vspace{0.3cm}**Machine learning terminology** The task of looking for marker genes for known anatomical regions means that one is looking for a set of genes such that, if the expression level of those genes is known, then the locations of the regions can be inferred. 
+\vspace{0.3cm}**Machine learning terminology: classifiers** The task of looking for marker genes for known anatomical regions means that one is looking for a set of genes such that, if the expression level of those genes is known, then the locations of the regions can be inferred. 
@@ -41,7 +41,7 @@
-If we define the regions so that they cover the entire anatomical structure to be divided, we may say that we are using gene expression to determine to which region each voxel within the structure belongs. We call this a __classification task__, because each voxel is being assigned to a class (namely, its region). An understanding of the relationship between the combination of their expression levels and the locations of the regions may be expressed as a function. The input to this function is a voxel, along with the gene expression levels within that voxel; the output is the regional identity of the target voxel, that is, the region to which the target voxel belongs. We call this function a __classifier__. In general, the input to a classifier is called an __instance__, and the output is called a __label__ (or a __class label__).
+If we define the regions so that they cover the entire anatomical structure to be subdivided, we may say that we are using gene expression in each voxel to assign that voxel to the proper area. We call this a __classification task__, because each voxel is being assigned to a class (namely, its region). An understanding of the relationship between the combination of their expression levels and the locations of the regions may be expressed as a function. The input to this function is a voxel, along with the gene expression levels within that voxel; the output is the regional identity of the target voxel, that is, the region to which the target voxel belongs. We call this function a __classifier__. In general, the input to a classifier is called an __instance__, and the output is called a __label__ (or a __class label__).
@@ -53,6 +53,9 @@
+Both gene expression data and anatomical atlases have errors, due to a variety of factors. Individual subjects have idiosyncratic anatomy. Subjects may be improperly registered to the atlas. The method used to measure gene expression may be noisy. The atlas may have errors. It is even possible that some areas in the anatomical atlas are "wrong" in that they do not have the same shape as the natural domains of gene expression to which they correspond. These sources of error can affect the displacement and the shape of both the gene expression data and the anatomical target areas. Therefore, it is important to use feature selection methods which are robust to these kinds of errors.
+
+
@@ -290,19 +293,7 @@
-Using Caret\cite{van_essen_integrated_2001}, we created a mesh representation of the surface of the selected voxels. For each gene, for each node of the mesh, we calculated an average of the gene expression of the voxels "underneath" that mesh node. We then flattened the cortex, creating a two-dimensional mesh. 
-
-We sampled the nodes of the irregular, flat mesh in order to create a regular grid of pixel values. We converted this grid into a MATLAB matrix.
-
-We manually traced the boundaries of each of 49 cortical areas from the ABA coronal reference atlas slides. We then converted these manual traces into Caret-format regional boundary data on the mesh surface. We projected the regions onto the 2-d mesh, and then onto the grid, and then we converted the region data into MATLAB format.
-
-At this point, the data are in the form of a number of 2-D matrices, all in registration, with the matrix entries representing a grid of points (pixels) over the cortical surface:
-
-
-
-* A 2-D matrix whose entries represent the regional label associated with each surface pixel
-* For each gene, a 2-D matrix whose entries represent the average expression level underneath each surface pixel 
-
+Using Caret\cite{van_essen_integrated_2001}, we created a mesh representation of the surface of the selected voxels. For each gene, and for each node of the mesh, we calculated an average of the gene expression of the voxels "underneath" that mesh node. We then flattened the cortex, creating a two-dimensional mesh. 
@@ -316,6 +307,19 @@
+We sampled the nodes of the irregular, flat mesh in order to create a regular grid of pixel values. We converted this grid into a MATLAB matrix.
+
+We manually traced the boundaries of each of 49 cortical areas from the ABA coronal reference atlas slides. We then converted these manual traces into Caret-format regional boundary data on the mesh surface. We projected the regions onto the 2-d mesh, and then onto the grid, and then we converted the region data into MATLAB format.
+
+At this point, the data are in the form of a number of 2-D matrices, all in registration, with the matrix entries representing a grid of points (pixels) over the cortical surface:
+
+
+
+* A 2-D matrix whose entries represent the regional label associated with each surface pixel
+* For each gene, a 2-D matrix whose entries represent the average expression level underneath each surface pixel 
+
+
+
@@ -339,15 +343,6 @@
-\vspace{0.3cm}**Correlation**
-Recall that the instances are surface pixels, and consider the problem of attempting to classify each instance as either a member of a particular anatomical area, or not. The target area can be represented as a boolean mask over the surface pixels. 
-
-One class of feature selection scoring methods contains methods which calculate some sort of "match" between each gene image and the target image. Those genes which match the best are good candidates for features.
-
-One of the simplest methods in this class is to use correlation as the match score. We calculated the correlation between each gene and each cortical area. The top row of Figure \ref{SScorrLr} shows the three genes most correlated with area SS.
-
-
-
@@ -355,6 +350,15 @@
+\vspace{0.3cm}**Correlation**
+Recall that the instances are surface pixels, and consider the problem of attempting to classify each instance as either a member of a particular anatomical area, or not. The target area can be represented as a boolean mask over the surface pixels. 
+
+One class of feature selection scoring methods contains methods which calculate some sort of "match" between each gene image and the target image. Those genes which match the best are good candidates for features.
+
+One of the simplest methods in this class is to use correlation as the match score. We calculated the correlation between each gene and each cortical area. The top row of Figure \ref{SScorrLr} shows the three genes most correlated with area SS.
+
+
+
@@ -424,10 +428,10 @@
-As noted earlier, in general, any predictive method can be used for feature selection by running it inside a stepwise wrapper. Also, some predictive methods integrate soft constraints on number of features used. Examples of both of these will be seen in the section "Multivariate Predictive methods".
-
-
-=== Multivariate Predictive methods ===
+As noted earlier, in general, any classifier can be used for feature selection by running it inside a stepwise wrapper. Also, some learning algorithms integrate soft constraints on the number of features used. Examples of both of these will be seen in the section "Multivariate supervised learning".
+
+
+=== Multivariate supervised learning ===
@@ -504,47 +508,39 @@
-We will develop scoring methods for evaluating how good individual genes are at marking areas. We will compare pointwise, geometric, and information-theoretic measures. We already developed one entirely new scoring method (gradient similarity), but we may develop more. Scoring measures that we will explore will include the L1 norm, correlation, expression energy ratio, conditional entropy, gradient similarity, Jaccard similarity, Dice similarity, Hough transform, and statistical tests such as Student's t-test, and the Mann-Whitney U test (a non-parametric test). In addition, any predictive procedure induces a scoring measure on genes by taking the prediction error when using that gene to predict the target. 
-
-
-
-Using some combination of these measures, we will develop a procedure to find single marker genes for anatomical regions: for each cortical area, we will rank the genes by their ability to delineate each area.
+We will develop scoring methods for evaluating how good individual genes are at marking areas. We will compare pointwise, geometric, and information-theoretic measures. We have already developed one entirely new scoring method (gradient similarity), but we may develop more. Scoring measures that we will explore will include the L1 norm, correlation, expression energy ratio, conditional entropy, gradient similarity, Jaccard similarity, Dice similarity, Hough transform, and statistical tests such as Student's t-test and the Mann-Whitney U test (a non-parametric test). In addition, any classifier induces a scoring measure on genes by taking the prediction error when using that gene to predict the target. 
+
+Using some combination of these measures, we will develop a procedure to find single marker genes for anatomical regions: for each cortical area, we will rank the genes by their ability to delineate each area. We will quantitatively compare the list of single genes generated by our method to the lists generated by previous methods which are mentioned in Aim 1 Related Work.
+
-We will develop a feature selection procedure for choosing the best small set of marker genes for a given anatomical area. In addition to using the scoring measures that we develop, we will also explore (a) feature selection using a stepwise wrapper over "vanilla" predictive methods such as logistic regression, (b) predictive methods such as decision trees which incrementally/greedily combine single gene markers into sets, and (c) predictive methods which use soft constraints to minimize number of features used, such as sparse support vector machines. 
-
-todo
-
-Some of these methods, such as the Hough transform, are designed to be resistant to registration error and error in the anatomical map. 
-
-We will also consider extensions to scoring measures that may improve their robustness to registration error and to error in the anatomical map; for example, a wrapper that runs a scoring method on small displacements and distortions of the data adds robustness to registration error at the expense of computation time. It is possible that some areas in the anatomical map do not correspond to natural domains of gene expression.
-
-# Extend the procedure to handle difficult areas by combining or redrawing the boundaries: An area may be difficult to identify because the boundaries are misdrawn, or because it does not "really" exist as a single area, at least on the genetic level. We will develop extensions to our procedure which (a) detect when a difficult area could be fit if its boundary were redrawn slightly, and (b) detect when a difficult area could be combined with adjacent areas to create a larger area which can be fit.
-
+We will develop a feature selection procedure for choosing the best small set of marker genes for a given anatomical area. In addition to using the scoring measures that we develop, we will also explore (a) feature selection using a stepwise wrapper over "vanilla" classifiers such as logistic regression, (b) supervised learning methods such as decision trees which incrementally/greedily combine single gene markers into sets, and (c) supervised learning methods which use soft constraints to minimize the number of features used, such as sparse support vector machines. 
+
+Since errors of displacement and of shape may cause genes and target areas to match less than they should, we will consider the robustness of feature selection methods in the presence of error. Some of these methods, such as the Hough transform, are designed to be resistant in the presence of error, but many are not. We will consider extensions to scoring measures that may improve their robustness; for example, a wrapper that runs a scoring method on small displacements and distortions of the data adds robustness to registration error at the expense of computation time. 
+
+An area may be difficult to identify because the boundaries are misdrawn in the atlas, or because the shape of the natural domain of gene expression corresponding to the area is different from the shape of the area as recognized by anatomists. We will extend our procedure to handle difficult areas by combining areas or redrawing their boundaries. We will develop extensions to our procedure which (a) detect when a difficult area could be fit if its boundary were redrawn slightly, and (b) detect when a difficult area could be combined with adjacent areas to create a larger area which can be fit.
+\vspace{0.3cm}**Classifiers**
+
+We will explore and compare different classifiers. As noted above, this activity is not separate from the previous one, because some supervised learning algorithms include feature selection, and any classifier can be combined with a stepwise wrapper for use as a feature selection method. We will explore logistic regression (including spatial models\cite{paciorek_computational_2007}), decision trees\footnote{Actually, we have already begun to explore decision trees. For each cortical area, we have used the C4.5 algorithm to find a decision tree for that area. We achieved good classification accuracy on our training set, but the number of genes that appeared in each tree was too large. We plan to implement a pruning procedure to generate trees that use fewer genes.}, sparse SVMs, generative mixture models (including naive Bayes), kernel density estimation, genetic algorithms, and artificial neural networks.
+
-todo
-
-\footnote{Already, for each cortical area, we have used the C4.5 algorithm to find a decision tree for that area. We achieved good classification accuracy on our training set, but the number of genes that appeared in each tree was too large. We plan to implement a pruning procedure to generate trees that use fewer genes}.
+
-# mixture models, etc
-
-
-
-# Explore clustering algorithms applied to genes: including gene shaving, TODO
+# Explore clustering algorithms applied to genes: including gene shaving\cite{hastie_gene_2000}, TODO
author	bshanks@bshanks-salk.dyndns.org
date	Tue Apr 21 17:35:00 2009 -0700 (16 years ago)
parents	9f36acf8d9a8
children	a25a60a4bf43
files	grant.bib grant.html grant.odt grant.pdf grant.txt