cg

changeset 94:e460569c21d4

.
author bshanks@bshanks-salk.dyndns.org
date Tue Apr 21 17:35:00 2009 -0700 (16 years ago)
parents 9f36acf8d9a8
children a25a60a4bf43
files grant.bib grant.html grant.odt grant.pdf grant.txt
line diff
1.1 --- a/grant.bib Tue Apr 21 14:50:10 2009 -0700 1.2 +++ b/grant.bib Tue Apr 21 17:35:00 2009 -0700 1.3 @@ -73,7 +73,7 @@ 1.4 publisher = {{ACM}}, 1.5 author = {Bayle Shanks}, 1.6 year = {2005}, 1.7 - keywords = {atom,client-side wiki,interoperability,interwiki,middleware,webdav,wiki,wikiclient,wikigateway,wikirpcinterface,wiki xmlrpc}, 1.8 + keywords = {atom,client-side wiki,interoperability,interwiki,middleware,webdav,wiki,wiki xmlrpc,wikiclient,wikigateway,wikirpcinterface}, 1.9 pages = {53--66} 1.10 }, 1.11 1.12 @@ -218,7 +218,7 @@ 1.13 booktitle = {Computational Systems Bioinformatics Conference, 2005. Workshops and Poster Abstracts. {IEEE}}, 1.14 author = {J. Carson and T. Ju and C. Thaller and M. Bello and I. Kakadiaris and J. Warren and G. Eichele and W. Chiu}, 1.15 year = {2005}, 1.16 - keywords = {atlas-based segmentation,automate robotic in situ hybridization image annotation,biological techniques,biological tissues,biology {computing,Brain,cell-cell} signaling,cell differentiation,cellular biophysics,cellular resolution,cluster analysis,data {mining,DNA} sequence database,functional genomics,gene expression pattern,genetics,image classification,image segmentation,mesh maps,pattern clustering,postnatal mouse brain,query interface,statistical analysis,tissue}, 1.17 + keywords = {atlas-based segmentation,automate robotic in situ hybridization image annotation,biological techniques,biological tissues,biology {computing,Brain,cell} differentiation,cell-cell signaling,cellular biophysics,cellular resolution,cluster analysis,data {mining,DNA} sequence database,functional genomics,gene expression pattern,genetics,image classification,image segmentation,mesh maps,pattern clustering,postnatal mouse brain,query interface,statistical analysis,tissue}, 1.18 pages = {358} 1.19 }, 1.20 1.21 @@ -469,4 +469,36 @@ 1.22 author = {Chris Adamson and Leigh Johnston and Terrie Inder and Sandra Rees and Iven Mareels and Gary Egan}, 1.23 year = {2005}, 1.24 pages = {294--301} 
1.25 +}, 1.26 + 1.27 +@article{paciorek_computational_2007, 1.28 + title = {Computational techniques for spatial logistic regression with large data sets}, 1.29 + volume = {51}, 1.30 + issn = {0167-9473}, 1.31 + url = {http://www.sciencedirect.com/science/article/B6V8V-4MG6JWS-2/2/dfe5cd9c7ac7bc39d22ce45eebe303b8}, 1.32 + doi = {10.1016/j.csda.2006.11.008}, 1.33 + abstract = {In epidemiological research, outcomes are frequently non-normal, sample sizes may be large, and effect sizes are often small. To relate health outcomes to geographic risk factors, fast and powerful methods for fitting spatial models, particularly for non-normal data, are required. I focus on binary outcomes, with the risk surface a smooth function of space, but the development herein is relevant for non-normal data in general. I compare penalized likelihood {(PL)} models, including the penalized quasi-likelihood {(PQL)} approach, and Bayesian models based on fit, speed, and ease of implementation. 1.34 +A Bayesian model using a spectral basis {(SB)} representation of the spatial surface via the Fourier basis provides the best tradeoff of sensitivity and specificity in simulations, detecting real spatial features while limiting overfitting and being reasonably computationally efficient. One of the contributions of this work is further development of this underused representation. The {SB} model outperforms the {PL} methods, which are prone to overfitting, but is slower to fit and not as easily implemented. A Bayesian Markov random field model performs less well statistically than the {SB} model, but is very computationally efficient. We illustrate the methods on a real data set of cancer cases in Taiwan. 
1.35 +The success of the {SB} with binary data and similar results with count data suggest that it may be generally useful in spatial models and more complicated hierarchical models.}, 1.36 + number = {8}, 1.37 + journal = {Computational Statistics \& Data Analysis}, 1.38 + author = {Christopher J. Paciorek}, 1.39 + month = may, 1.40 + year = {2007}, 1.41 + keywords = {Bayesian {statistics,Disease} {mapping,Fourier} {basis,Generalized} linear mixed {model,Geostatistics,Risk} {surface,Spatial} {statistics,Spectral} basis}, 1.42 + pages = {3631--3653} 1.43 +}, 1.44 + 1.45 +@article{hastie_gene_2000, 1.46 + title = {{'Gene} shaving' as a method for identifying distinct sets of genes with similar expression patterns}, 1.47 + volume = {1}, 1.48 + issn = {1465-6906}, 1.49 + url = {http://genomebiology.com/2000/1/2/research/0003/}, 1.50 + doi = {10.1186/gb-2000-1-2-research0003}, 1.51 + abstract = {{BACKGROUND:Large} gene expression studies, such as those conducted using {DNA} arrays, often provide millions of different pieces of data. To address the problem of analyzing such data, we describe a statistical method, which we have called 'gene shaving'. The method identifies subsets of genes with coherent expression patterns and large variation across conditions. Gene shaving differs from hierarchical clustering and other widely used methods for analyzing gene expression studies in that genes may belong to more than one cluster, and the clustering may be supervised by an outcome measure. The technique can be 'unsupervised', that is, the genes and samples are treated as unlabeled, or partially or fully supervised by using known properties of the genes or samples to assist in finding meaningful {groupings.RESULTS:We} illustrate the use of the gene shaving method to analyze gene expression measurements made on samples from patients with diffuse large B-cell lymphoma. 
The method identifies a small cluster of genes whose expression is highly predictive of {survival.CONCLUSIONS:The} gene shaving method is a potentially useful tool for exploration of gene expression data and identification of interesting clusters of genes worth further investigation.}, 1.52 + number = {2}, 1.53 + journal = {Genome Biology}, 1.54 + author = {Trevor Hastie and Robert Tibshirani and Michael Eisen and Ash Alizadeh and Ronald Levy and Louis Staudt and Wing Chan and David Botstein and Patrick Brown}, 1.55 + year = {2000}, 1.56 + pages = {research0003.1--research0003.21} 1.57 } 1.58 \ No newline at end of file
2.1 --- a/grant.html Tue Apr 21 14:50:10 2009 -0700 2.2 +++ b/grant.html Tue Apr 21 17:35:00 2009 -0700 2.3 @@ -13,7 +13,8 @@ 2.4 Caret, an existing open-source scientific imaging program. Use this dataset to validate the methods developed in (1) and (2). 2.5 Although our particular application involves the 3D spatial distribution of gene expression, we anticipate that the methods 2.6 developed in aims (1) and (2) will generalize to any sort of high-dimensional data over points located in a low-dimensional 2.7 -space. 2.8 +space. In particular, our method could be applied to genome-wide sequencing data derived from sets of tissues and disease 2.9 +states. 2.10 In terms of the application of the methods to cerebral cortex, aim (1) is to go from cortical areas to marker genes, 2.11 and aim (2) is to let the gene profile define the cortical areas. In addition to validating the usefulness of the algorithms, 2.12 the application of these methods to cortex will produce immediate benefits, because there are currently no known genetic 2.13 @@ -29,21 +30,21 @@ 2.14 gene expression to anatomy. We want to find marker genes for specific anatomical regions, and also to draw new anatomical 2.15 maps based on gene expression patterns. 2.16 The Challenge and Potential impact 2.17 -Now we will discuss each of our three aims in turn. For each aim, we will develop a conceptual framework for thinking 2.18 -about the task, and we will present our strategy for solving it. Next we will discuss related work. At the conclusion of each 2.19 -section, we will summarize why our strategy is different from what has been done before. At the end of this section, we will 2.20 -describe the potential impact. 2.21 +Each of our three aims will be discussed in turn. For each aim, we will develop a conceptual framework for thinking about 2.22 +the task, and we will present our strategy for solving it. Next we will discuss related work. 
At the conclusion of each section, 2.23 +we will summarize why our strategy is different from what has been done before. At the end of this section, we will describe 2.24 +the potential impact. 2.25 Aim 1: Given a map of regions, find genes that mark the regions 2.26 -Machine learning terminology The task of looking for marker genes for known anatomical regions means that one is 2.27 -looking for a set of genes such that, if the expression level of those genes is known, then the locations of the regions can be 2.28 -inferred. 2.29 -If we define the regions so that they cover the entire anatomical structure to be divided, we may say that we are using 2.30 -gene expression to determine to which region each voxel within the structure belongs. We call this a classification task, 2.31 -because each voxel is being assigned to a class (namely, its region). An understanding of the relationship between the 2.32 -combination of their expression levels and the locations of the regions may be expressed as a function. The input to this 2.33 -function is a voxel, along with the gene expression levels within that voxel; the output is the regional identity of the target 2.34 -voxel, that is, the region to which the target voxel belongs. We call this function a classifier. In general, the input to a 2.35 -classifier is called an instance, and the output is called a label (or a class label). 2.36 +Machine learning terminology: classifiers The task of looking for marker genes for known anatomical regions means 2.37 +that one is looking for a set of genes such that, if the expression level of those genes is known, then the locations of the 2.38 +regions can be inferred. 2.39 +If we define the regions so that they cover the entire anatomical structure to be subdivided, we may say that we are 2.40 +using gene expression in each voxel to assign that voxel to the proper area. We call this a classification task, because each 2.41 +voxel is being assigned to a class (namely, its region). 
An understanding of the relationship between the combination of 2.42 +their expression levels and the locations of the regions may be expressed as a function. The input to this function is a voxel, 2.43 +along with the gene expression levels within that voxel; the output is the regional identity of the target voxel, that is, the 2.44 +region to which the target voxel belongs. We call this function a classifier. In general, the input to a classifier is called an 2.45 +instance, and the output is called a label (or a class label). 2.46 The object of aim 1 is not to produce a single classifier, but rather to develop an automated method for determining a 2.47 classifier for any known anatomical structure. Therefore, we seek a procedure by which a gene expression dataset may be 2.48 analyzed in concert with an anatomical atlas in order to produce a classifier. The initial gene expression dataset used in 2.49 @@ -63,17 +64,23 @@ 2.50 and then aggregating these sub-scores into a final score (the aggregation is often a sum or a sum of squares or average). If 2.51 only information from nearby voxels is used to calculate a voxel’s sub-score, then we say it is a local scoring method. If only 2.52 information from the voxel itself is used to calculate a voxel’s sub-score, then we say it is a pointwise scoring method. 2.53 +Both gene expression data and anatomical atlases have errors, due to a variety of factors. Individual subjects have 2.54 +idiosyncratic anatomy. Subjects may be improperly registred to the atlas. The method used to measure gene expression 2.55 +may be noisy. The atlas may have errors. It is even possible that some areas in the anatomical atlas are “wrong” in that 2.56 +they do not have the same shape as the natural domains of gene expression to which they correspond. These sources of error 2.57 +can affect the displacement and the shape of both the gene expression data and the anatomical target areas. 
Therefore, it 2.58 +is important to use feature selection methods which are robust to these kinds of errors. 2.59 Our strategy for Aim 1 2.60 Key questions when choosing a learning method are: What are the instances? What are the features? How are the features 2.61 chosen? Here are four principles that outline our answers to these questions. 2.62 +_________________________________________ 2.63 + 1Strictly speaking, the features are gene expression levels, but we’ll call them genes. 2.64 Principle 1: Combinatorial gene expression 2.65 -It is too much to hope that every anatomical region of interest will be identified by a single gene. For example, in the 2.66 +It istoo much to hope that every anatomical region of interest will be identified by a single gene. For example, in the 2.67 cortex, there are some areas which are not clearly delineated by any gene included in the Allen Brain Atlas (ABA) dataset. 2.68 However, at least some of these areas can be delineated by looking at combinations of genes (an example of an area for 2.69 which multiple genes are necessary and sufficient is provided in Preliminary Studies, Figure 4). Therefore, each instance 2.70 should contain multiple features (genes). 2.71 -_______ 2.72 - 1Strictly speaking, the features are gene expression levels, but we’ll call them genes. 2.73 Principle 2: Only look at combinations of small numbers of genes 2.74 When the classifier classifies a voxel, it is only allowed to look at the expression of the genes which have been selected 2.75 as features. The more data that are available to a classifier, the better that it can do. For example, perhaps there are weak 2.76 @@ -107,35 +114,34 @@ 2.77 we believe that domain-specific scoring measures (such as gradient similarity, which is discussed in Preliminary Studies) may 2.78 be necessary in order to achieve the best results in this application. 
2.79 We are aware of six existing efforts to find marker genes using spatial gene expression data using automated methods. 2.80 -[11 ] mentions the possibility of constructing a spatial region for each gene, and then, for each anatomical structure of 2.81 +[12 ] mentions the possibility of constructing a spatial region for each gene, and then, for each anatomical structure of 2.82 interest, computing what proportion of this structure is covered by the gene’s spatial region. 2.83 -GeneAtlas[5] and EMAGE [23] allow the user to construct a search query by demarcating regions and then specifing 2.84 +GeneAtlas[5] and EMAGE [25] allow the user to construct a search query by demarcating regions and then specifing 2.85 either the strength of expression or the name of another gene or dataset whose expression pattern is to be matched. For the 2.86 similiarity score (match score) between two images (in this case, the query and the gene expression images), GeneAtlas uses 2.87 the sum of a weighted L1-norm distance between vectors whose components represent the number of cells within a pixel3 2.88 whose expression is within four discretization levels. EMAGE uses Jaccard similarity4. Neither GeneAtlas nor EMAGE 2.89 allow one to search for combinations of genes that define a region in concert but not separately. 2.90 -[13 ] describes AGEA, ”Anatomic Gene Expression Atlas”. AGEA has three components. Gene Finder: The user 2.91 +[14 ] describes AGEA, ”Anatomic Gene Expression Atlas”. AGEA has three components. Gene Finder: The user 2.92 selects a seed voxel and the system (1) chooses a cluster which includes the seed voxel, (2) yields a list of genes which are 2.93 overexpressed in that cluster. (note: the ABA website also contains pre-prepared lists of overexpressed genes for selected 2.94 structures). 
Correlation: The user selects a seed voxel and the system then shows the user how much correlation there is 2.95 between the gene expression profile of the seed voxel and every other voxel. Clusters: will be described later 2.96 +_________________________________________ 2.97 + 2By “fundamentally spatial” we mean that there is information from a large number of spatial locations indexed by spatial coordinates; not 2.98 +just data which have only a few different locations or which is indexed by anatomical label. 2.99 + 3Actually, many of these projects use quadrilaterals instead of square pixels; but we will refer to them as pixels for simplicity. 2.100 + 4the number of true pixels in the intersection of the two images, divided by the number of pixels in their union. 2.101 Gene Finder is different from our Aim 1 in at least three ways. First, Gene Finder finds only single genes, whereas we 2.102 will also look for combinations of genes. Second, gene finder can only use overexpression as a marker, whereas we will also 2.103 search for underexpression. Third, Gene Finder uses a simple pointwise score5, whereas we will also use geometric scores 2.104 such as gradient similarity (described in Preliminary Studies). Figures 4, 2, and 3 in the Preliminary Studies section contains 2.105 evidence that each of our three choices is the right one. 2.106 -_________________________________________ 2.107 - 2By “fundamentally spatial” we mean that there is information from a large number of spatial locations indexed by spatial coordinates; not 2.108 -just data which have only a few different locations or which is indexed by anatomical label. 2.109 - 3Actually, many of these projects use quadrilaterals instead of square pixels; but we will refer to them as pixels for simplicity. 2.110 - 4the number of true pixels in the intersection of the two images, divided by the number of pixels in their union. 2.111 - 5“Expression energy ratio”, which captures overexpression. 
2.112 [6 ] looks at the mean expression level of genes within anatomical regions, and applies a Student’s t-test with Bonferroni 2.113 correction to determine whether the mean expression level of a gene is significantly higher in the target region. Like AGEA, 2.114 this is a pointwise measure (only the mean expression level per pixel is being analyzed), it is not being used to look for 2.115 underexpression, and does not look for combinations of genes. 2.116 -[9 ] describes a technique to find combinations of marker genes to pick out an anatomical region. They use an evolutionary 2.117 +[10 ] describes a technique to find combinations of marker genes to pick out an anatomical region. They use an evolutionary 2.118 algorithm to evolve logical operators which combine boolean (thresholded) images in order to match a target image. Their 2.119 match score is Jaccard similarity. 2.120 In summary, there has been fruitful work on finding marker genes, but only one of the previous projects explores 2.121 @@ -176,17 +182,18 @@ 2.122 extraction or dimensionality reduction. The small set of features that such a technique yields is called the reduced feature 2.123 set. Note that the features in the reduced feature set do not necessarily correspond to genes; each feature in the reduced set 2.124 may be any function of the set of gene expression levels. 2.125 +_________________________________________ 2.126 + 5“Expression energy ratio”, which captures overexpression. 2.127 + 6There are imaging tasks which use more than three colors, for example multispectral imaging and hyperspectral imaging, which are often 2.128 +used to process satellite imagery. 2.129 + 7First, because the number of features in the reduced dataset is less than in the original dataset, the running time of clustering algorithms 2.130 +may be much less. Second, it is thought that some clustering algorithms may give better results on reduced data. 
2.131 Clustering genes rather than voxels Although the ultimate goal is to cluster the instances (voxels or pixels), one 2.132 strategy to achieve this goal is to first cluster the features (genes). There are two ways that clusters of genes could be used. 2.133 Gene clusters could be used as part of dimensionality reduction: rather than have one feature for each gene, we could 2.134 have one reduced feature for each gene cluster. 2.135 -__ 2.136 - 6There are imaging tasks which use more than three colors, for example multispectral imaging and hyperspectral imaging, which are often 2.137 -used to process satellite imagery. 2.138 - 7First, because the number of features in the reduced dataset is less than in the original dataset, the running time of clustering algorithms 2.139 -may be much less. Second, it is thought that some clustering algorithms may give better results on reduced data. 2.140 Gene clusters could also be used to directly yield a clustering on instances. This is because many genes have an expression 2.141 -patternwhich seems to pick out a single, spatially continguous region. Therefore, it seems likely that an anatomically 2.142 +pattern which seems to pick out a single, spatially continguous region. Therefore, it seems likely that an anatomically 2.143 interesting region will have multiple genes which each individually pick it out8. This suggests the following procedure: 2.144 cluster together genes which pick out similar regions, and then to use the more popular common regions as the final clusters. 2.145 In Preliminary Studies, Figure 7, we show that a number of anatomically recognized cortical regions, as well as some 2.146 @@ -194,20 +201,20 @@ 2.147 The task of clustering both the instances and the features is called co-clustering, and there are a number of co-clustering 2.148 algorithms. 2.149 Related work 2.150 -Some researchers have attempted to parcellate cortex on the basis of non-gene expression data. 
For example, [15], [2], [16], 2.151 -and [1 ] associate spots on the cortex with the radial profile9 of response to some stain ([10] uses MRI), extract features from 2.152 +Some researchers have attempted to parcellate cortex on the basis of non-gene expression data. For example, [17], [2], [18], 2.153 +and [1 ] associate spots on the cortex with the radial profile9 of response to some stain ([11] uses MRI), extract features from 2.154 this profile, and then use similarity between surface pixels to cluster. Features used include statistical moments, wavelets, 2.155 and the excess mass functional. Some of these features are motivated by the presence of tangential lines of stain intensity 2.156 which correspond to laminar structure. Some methods use standard clustering procedures, whereas others make use of the 2.157 spatial nature of the data to look for sudden transitions, which are identified as areal borders. 2.158 -[20 ] describes an analysis of the anatomy of the hippocampus using the ABA dataset. In addition to manual analysis, 2.159 +[22 ] describes an analysis of the anatomy of the hippocampus using the ABA dataset. In addition to manual analysis, 2.160 two clustering methods were employed, a modified Non-negative Matrix Factorization (NNMF), and a hierarchial recursive 2.161 bifurcation clustering scheme based on correlation as the similarity score. The paper yielded impressive results, proving 2.162 the usefulness of computational genomic anatomy. We have run NNMF on the cortical dataset10 and while the results are 2.163 promising, they also demonstrate that NNMF is not necessarily the best dimensionality reduction method for this application 2.164 (see Preliminary Studies, Figure 6). 2.165 -AGEA[13] includes a preset hierarchial clustering of voxels based on a recursive bifurcation algorithm with correlation 2.166 -as the similarity metric. 
EMAGE[23] allows the user to select a dataset from among a large number of alternatives, or by 2.167 +AGEA[14] includes a preset hierarchial clustering of voxels based on a recursive bifurcation algorithm with correlation 2.168 +as the similarity metric. EMAGE[25] allows the user to select a dataset from among a large number of alternatives, or by 2.169 running a search query, and then to cluster the genes within that dataset. EMAGE clusters via hierarchial complete linkage 2.170 clustering with un-centred correlation as the similarity score. 2.171 [6 ] clustered genes, starting out by selecting 135 genes out of 20,000 which had high variance over voxels and which were 2.172 @@ -217,10 +224,10 @@ 2.173 similarity using a least squares metric”. The resulting matrix showed four clusters. For each cluster, prototypical spatial 2.174 expression patterns were created by averaging the genes in the cluster. The prototypes were analyzed manually, without 2.175 clustering voxels. 2.176 -[9 ] applies their technique for finding combinations of marker genes for the purpose of clustering genes around a “seed 2.177 +[10 ] applies their technique for finding combinations of marker genes for the purpose of clustering genes around a “seed 2.178 gene”. They do this by using the pattern of expression of the seed gene as the target image, and then searching for other 2.179 genes which can be combined to reproduce this pattern. Other genes which are found are considered to be related to the 2.180 -seed. The same team also describes a method[22] for finding “association rules” such as, “if this voxel is expressed in by 2.181 +seed. The same team also describes a method[24] for finding “association rules” such as, “if this voxel is expressed in by 2.182 any gene, then that voxel is probably also expressed in by the same gene”. This could be useful as part of a procedure for 2.183 clustering voxels. 
2.184 In summary, although these projects obtained clusterings, there has not been much comparison between different algo- 2.185 @@ -228,10 +235,6 @@ 2.186 projects using gene expression on cortex did not attempt to make use of the radial profile of gene expression. Also, none of 2.187 these projects did a separate dimensionality reduction step before clustering pixels, none tried to cluster genes first in order 2.188 to guide automated clustering of pixels into spatial regions, and none used co-clustering algorithms. 2.189 -Aim 3: apply the methods developed to the cerebral cortex 2.190 -Background 2.191 -The cortex is divided into areas and layers. Because of the cortical columnar organization, the parcellation of the cortex 2.192 -into areas can be drawn as a 2-D map on the surface of the cortex. In the third dimension, the boundaries between the 2.193 _________________________________________ 2.194 8This would seem to contradict our finding in aim 1 that some cortical areas are combinatorially coded by multiple genes. However, it is 2.195 possible that the currently accepted cortical maps divide the cortex into regions which are unnatural from the point of view of gene expression; 2.196 @@ -241,16 +244,20 @@ 2.197 10We ran “vanilla” NNMF, whereas the paper under discussion used a modified method. Their main modification consisted of adding a soft 2.198 spatial contiguity constraint. However, on our dataset, NNMF naturally produced spatially contiguous clusters, so no additional constraint was 2.199 needed. The paper under discussion also mentions that they tried a hierarchial variant of NNMF, which we have not yet tried. 2.200 +Aim 3: apply the methods developed to the cerebral cortex 2.201 +Background 2.202 +The cortex is divided into areas and layers. Because of the cortical columnar organization, the parcellation of the cortex 2.203 +into areas can be drawn as a 2-D map on the surface of the cortex. 
In the third dimension, the boundaries between the 2.204 areas continue downwards into the cortical depth, perpendicular to the surface. The layer boundaries run parallel to the 2.205 -surface.One can picture an area of the cortex as a slice of a six-layered cake11. 2.206 +surface. One can picture an area of the cortex as a slice of a six-layered cake11. 2.207 It is known that different cortical areas have distinct roles in both normal functioning and in disease processes, yet there 2.208 are no known marker genes for most cortical areas. When it is necessary to divide a tissue sample into cortical areas, this is 2.209 a manual process that requires a skilled human to combine multiple visual cues and interpret them in the context of their 2.210 approximate location upon the cortical surface. 2.211 Even the questions of how many areas should be recognized in cortex, and what their arrangement is, are still not 2.212 completely settled. A proposed division of the cortex into areas is called a cortical map. In the rodent, the lack of a single 2.213 -agreed-upon map can be seen by contrasting the recent maps given by Swanson[19] on the one hand, and Paxinos and 2.214 -Franklin[14] on the other. While the maps are certainly very similar in their general arrangement, significant differences 2.215 +agreed-upon map can be seen by contrasting the recent maps given by Swanson[21] on the one hand, and Paxinos and 2.216 +Franklin[16] on the other. While the maps are certainly very similar in their general arrangement, significant differences 2.217 remain. 2.218 The Allen Mouse Brain Atlas dataset 2.219 The Allen Mouse Brain Atlas (ABA) data were produced by doing in-situ hybridization on slices of male, 56-day-old 2.220 @@ -260,8 +267,8 @@ 2.221 brains were needed in order to measure the expression of many genes. 2.222 An automated nonlinear alignment procedure located the 2D data from the various slices in a single 3D coordinate 2.223 system. 
In the final 3D coordinate system, voxels are cubes with 200 microns on a side. There are 67x41x58 = 159,326 2.224 -voxels in the 3D coordinate system, of which 51,533 are in the brain[13]. 2.225 -Mus musculus is thought to contain about 22,000 protein-coding genes[25]. The ABA contains data on about 20,000 2.226 +voxels in the 3D coordinate system, of which 51,533 are in the brain[14]. 2.227 +Mus musculus is thought to contain about 22,000 protein-coding genes[27]. The ABA contains data on about 20,000 2.228 genes in sagittal sections, out of which over 4,000 genes are also measured in coronal sections. Our dataset is derived from 2.229 only the coronal subset of the ABA12. 2.230 The ABA is not the only large public spatial gene expression dataset13. With the exception of the ABA, GenePaint, and 2.231 @@ -269,7 +276,7 @@ 2.232 the results into a single 3-D space, and to our knowledge only ABA and EMAGE make this form of data available for public 2.233 download from the website14. Many of these resources focus on developmental gene expression. 2.234 Related work 2.235 -[13 ] describes the application of AGEA to the cortex. The paper describes interesting results on the structure of correlations 2.236 +[14 ] describes the application of AGEA to the cortex. The paper describes interesting results on the structure of correlations 2.237 between voxel gene expression profiles within a handful of cortical areas. However, this sort of analysis is not related to either 2.238 of our aims, as it neither finds marker genes, nor does it suggest a cortical map based on gene expression data. 
Neither of 2.239 the other components of AGEA can be applied to cortical areas; AGEA’s Gene Finder cannot be used to find marker genes 2.240 @@ -278,20 +285,17 @@ 2.241 been almost no comparison of different algorithms or scoring methods, and (c) there has been no work on computationally 2.242 finding marker genes for cortical areas, or on finding a hierarchial clustering that will yield a map of cortical areas de novo 2.243 from gene expression data. 2.244 -Our project is guided by a concrete application with a well-specified criterion of success (how well we can find marker 2.245 -genes for / reproduce the layout of cortical areas), which will provide a solid basis for comparing different methods. 2.246 -Significance 2.247 -________________________ 2.248 -11Outside of isocortex, the number of layers varies. 2.249 - 12The sagittal data do not cover the entire cortex, and also have greater registration error[13]. Genes were selected by the Allen Institute for 2.250 +___________________ 2.251 + 11Outside of isocortex, the number of layers varies. 2.252 + 12The sagittal data do not cover the entire cortex, and also have greater registration error[14]. Genes were selected by the Allen Institute for 2.253 coronal sectioning based on, “classes of known neuroscientific interest... or through post hoc identification of a marked non-ubiquitous expression 2.254 -pattern”[13]. 2.255 - 13Other such resources include GENSAT[8], GenePaint[24], its sister project GeneAtlas[5], BGEM[12], EMAGE[23], EurExpress (http://www. 2.256 -eurexpress.org/ee/; EurExpress data are also entered into EMAGE), EADHB (http://www.ncl.ac.uk/ihg/EADHB/database/EADHB_database. 2.257 -html), MAMEP (http://mamep.molgen.mpg.de/index.php), Xenbase (http://xenbase.org/), ZFIN[18], Aniseed (http://aniseed-ibdm. 
2.258 -univ-mrs.fr/), VisiGene (http://genome.ucsc.edu/cgi-bin/hgVisiGene ; includes data from some of the other listed data sources), GEISHA[4], 2.259 -Fruitfly.org[21], COMPARE (http://compare.ibdml.univ-mrs.fr/), GXD[17], GEO[3] (GXD and GEO contain spatial data but also non-spatial 2.260 -data. All GXD spatial data are also in EMAGE.) 2.261 +pattern”[14]. 2.262 + 13Other such resources include GENSAT[8], GenePaint[26], its sister project GeneAtlas[5], BGEM[13], EMAGE[25], EurExpress (http: 2.263 +//www.eurexpress.org/ee/; EurExpress data are also entered into EMAGE), EADHB (http://www.ncl.ac.uk/ihg/EADHB/database/EADHB_ 2.264 +database.html), MAMEP (http://mamep.molgen.mpg.de/index.php), Xenbase (http://xenbase.org/), ZFIN[20], Aniseed (http:// 2.265 +aniseed-ibdm.univ-mrs.fr/), VisiGene (http://genome.ucsc.edu/cgi-bin/hgVisiGene ; includes data from some of the other listed data 2.266 +sources), GEISHA[4], Fruitfly.org[23], COMPARE (http://compare.ibdml.univ-mrs.fr/), GXD[19], GEO[3] (GXD and GEO contain spatial 2.267 +data but also non-spatial data. All GXD spatial data are also in EMAGE.) 2.268 14without prior offline registration 2.269 15In both cases, the cause is that pairwise correlations between the gene expression of voxels in different areas but the same layer are often stronger 2.270 than pairwise correlations between the gene expression of voxels in different layers but the same area. Therefore, a pairwise voxel correlation 2.271 @@ -299,7 +303,9 @@ 2.272 intersection of a layer and an area, but since one area will have many layer-area intersection clusters, further work is needed to make sense of 2.273 these). The reason that Gene Finder cannot find marker genes for cortical areas is that, although the user chooses a seed voxel, Gene Finder 2.274 chooses the ROI for which genes will be found, and it creates that ROI by (pairwise voxel correlation) clustering around the seed.
2.275 - 2.276 +Our project is guided by a concrete application with a well-specified criterion of success (how well we can find marker 2.277 +genes for / reproduce the layout of cortical areas), which will provide a solid basis for comparing different methods. 2.278 +Significance 2.279 2.280 2.281 Figure 1: Top row: Genes Nfic and 2.282 @@ -338,15 +344,15 @@ 2.283 expression. 2.284 While we do not here propose to analyze human gene expression data, it is 2.285 conceivable that the methods we propose to develop could be used to suggest 2.286 - modifications to the human cortical map as well. In fact, the methods we 2.287 - will develop will be applicable to other datasets beyond the brain. We will 2.288 - provide an open-source toolbox to allow other researchers to easily use our 2.289 - methods. With these methods, researchers with gene expression for any area 2.290 - of the body will be able to efficiently find marker genes for anatomical regions, 2.291 -or to use gene expression to discover new anatomical patterning. As described above, marker genes have a variety of uses in 2.292 -the development of drugs and experimental manipulations, and in the anatomical characterization of tissue samples. The 2.293 -discovery of new ways to carve up anatomical structures into regions may lead to the discovery of new anatomical subregions 2.294 -in various structures, which will widely impact all areas of biology. 2.295 + modifications to the human cortical map as well. In fact, the methods we will 2.296 + develop will be applicable to other datasets beyond the brain. We will provide 2.297 + an open-source toolbox to allow other researchers to easily use our methods. 2.298 + With these methods, researchers with gene expression for any area of the body 2.299 + will be able to efficiently find marker genes for anatomical regions, or to use 2.300 + gene expression to discover new anatomical patterning. 
As described above, 2.301 +marker genes have a variety of uses in the development of drugs and experimental manipulations, and in the anatomical 2.302 +characterization of tissue samples. The discovery of new ways to carve up anatomical structures into regions may lead to 2.303 +the discovery of new anatomical subregions in various structures, which will widely impact all areas of biology. 2.304 2.305 Figure 2: Gene Pitx2 2.306 is selectively underex- 2.307 @@ -362,20 +368,12 @@ 2.308 Flatmap of cortex 2.309 We downloaded the ABA data and applied a mask to select only those voxels which belong to 2.310 cerebral cortex. We divided the cortex into hemispheres. 2.311 -Using Caret[7], we created a mesh representation of the surface of the selected voxels. For each gene, for each node of 2.312 -the mesh, we calculated an average of the gene expression of the voxels “underneath” that mesh node. We then flattened 2.313 +Using Caret[7], we created a mesh representation of the surface of the selected voxels. For each gene, and for each node 2.314 +of the mesh, we calculated an average of the gene expression of the voxels “underneath” that mesh node. We then flattened 2.315 the cortex, creating a two-dimensional mesh. 2.316 -We sampled the nodes of the irregular, flat mesh in order to create a regular grid of pixel values. We converted this grid 2.317 -into a MATLAB matrix. 2.318 -We manually traced the boundaries of each of 49 cortical areas from the ABA coronal reference atlas slides. We then 2.319 -converted these manual traces into Caret-format regional boundary data on the mesh surface. We projected the regions 2.320 -onto the 2-d mesh, and then onto the grid, and then we converted the region data into MATLAB format. 2.321 -_________________________________________ 2.322 +____ 2.323 16SEV is a sparse format for spatial data. It is the format in which the ABA data is made available. 
2.324 -At this point, the data are in the form of a number of 2-D matrices, all in registration, with the matrix entries representing 2.325 -a grid of points (pixels) over the cortical surface: 2.326 -∙A 2-D matrix whose entries represent the regional label associated with each surface pixel 2.327 -∙For each gene, a 2-D matrix whose entries represent the average expression level underneath each surface pixel 2.328 + 2.329 2.330 2.331 Figure 3: The top row shows the two genes 2.332 @@ -385,7 +383,21 @@ 2.333 vidually) best match area AUD, according 2.334 to gradient similarity. From left to right and 2.335 top to bottom, the genes are Ssr1, Efcbp1, 2.336 -Ptk7, and Aph1a. We created a normalized version of the gene expression data by subtracting 2.337 +Ptk7, and Aph1a. We sampled the nodes of the irregular, flat mesh in order to create a regular 2.338 + grid of pixel values. We converted this grid into a MATLAB matrix. 2.339 + We manually traced the boundaries of each of 49 cortical areas from the 2.340 + ABA coronal reference atlas slides. We then converted these manual traces 2.341 + into Caret-format regional boundary data on the mesh surface. We projected 2.342 + the regions onto the 2-d mesh, and then onto the grid, and then we converted 2.343 + the region data into MATLAB format. 2.344 + At this point, the data are in the form of a number of 2-D matrices, all in 2.345 + registration, with the matrix entries representing a grid of points (pixels) over 2.346 + the cortical surface: 2.347 + ∙ A 2-D matrix whose entries represent the regional label associated with 2.348 + each surface pixel 2.349 + ∙ For each gene, a 2-D matrix whose entries represent the average expres- 2.350 + sion level underneath each surface pixel 2.351 + We created a normalized version of the gene expression data by subtracting 2.352 each gene’s mean expression level (over all surface pixels) and dividing the 2.353 expression level of each gene by its standard deviation. 
2.354 The features and the target area are both functions on the surface pix- 2.355 @@ -393,31 +405,30 @@ 2.356 alternately, they can be thought of as images which can be displayed on the 2.357 flatmapped surface. 2.358 To move beyond a single average expression level for each surface pixel, we 2.359 - plan to create a separate matrix for each cortical layer to represent the average 2.360 - expression level within that layer. Cortical layers are found at different depths 2.361 - in different parts of the cortex. In preparation for extracting the layer-specific 2.362 - datasets, we have extended Caret with routines that allow the depth of the 2.363 - ROI for volume-to-surface projection to vary. 2.364 - In the Research Plan, we describe how we will automatically locate the 2.365 - layer depths. For validation, we have manually demarcated the depth of the 2.366 - outer boundary of cortical layer 5 throughout the cortex. 2.367 - Feature selection and scoring methods 2.368 - Underexpression of a gene can serve as a marker Underexpression of a 2.369 - gene can sometimes serve as a marker. See, for example, Figure 2. 2.370 - Correlation Recall that the instances are surface pixels, and consider the 2.371 - problem of attempting to classify each instance as either a member of a partic- 2.372 - ular anatomical area, or not. The target area can be represented as a boolean 2.373 - mask over the surface pixels. 2.374 -One class of feature selection scoring methods contains methods which calculate some sort of “match” between each gene 2.375 -image and the target image. Those genes which match the best are good candidates for features. 2.376 -One of the simplest methods in this class is to use correlation as the match score. We calculated the correlation between 2.377 -each gene and each cortical area. The top row of Figure 1 shows the three genes most correlated with area SS. 
2.378 +plan to create a separate matrix for each cortical layer to represent the average expression level within that layer. Cortical 2.379 +layers are found at different depths in different parts of the cortex. In preparation for extracting the layer-specific datasets, 2.380 +we have extended Caret with routines that allow the depth of the ROI for volume-to-surface projection to vary. 2.381 +In the Research Plan, we describe how we will automatically locate the layer depths. For validation, we have manually 2.382 +demarcated the depth of the outer boundary of cortical layer 5 throughout the cortex. 2.383 +Feature selection and scoring methods 2.384 +Underexpression of a gene can serve as a marker Underexpression of a gene can sometimes serve as a marker. See, 2.385 +for example, Figure 2. 2.386 2.387 2.388 Figure 4: Upper left: wwc1. Upper right: 2.389 mtif2. Lower left: wwc1 + mtif2 (each 2.390 pixel’s value on the lower left is the sum of 2.391 -the corresponding pixels in the upper row). Conditional entropy An information-theoretic scoring method is to find 2.392 +the corresponding pixels in the upper row). Correlation Recall that the instances are surface pixels, and consider the 2.393 + problem of attempting to classify each instance as either a member of a partic- 2.394 + ular anatomical area, or not. The target area can be represented as a boolean 2.395 + mask over the surface pixels. 2.396 + One class of feature selection scoring methods contains methods which cal- 2.397 + culate some sort of “match” between each gene image and the target image. 2.398 + Those genes which match the best are good candidates for features. 2.399 + One of the simplest methods in this class is to use correlation as the match 2.400 + score. We calculated the correlation between each gene and each cortical area. 2.401 + The top row of Figure 1 shows the three genes most correlated with area SS. 
2.402 + Conditional entropy An information-theoretic scoring method is to find 2.403 features such that, if the features (gene expression levels) are known, uncer- 2.404 tainty about the target (the regional identity) is reduced. Entropy measures 2.405 uncertainty, so what we want is to find features such that the conditional dis- 2.406 @@ -425,20 +436,16 @@ 2.407 referring is the probability distribution over the population of surface pixels. 2.408 The simplest way to use information theory is on discrete data, so we 2.409 discretized our gene expression data by creating, for each gene, five thresholded 2.410 - boolean masks of the gene data. For each gene, we created a boolean mask of 2.411 - its expression levels using each of these thresholds: the mean of that gene, the 2.412 - mean minus one standard deviation, the mean minus two standard deviations, 2.413 - the mean plus one standard deviation, the mean plus two standard deviations. 2.414 - Now, for each region, we created and ran a forward stepwise procedure 2.415 - which attempted to find pairs of gene expression boolean masks such that the 2.416 - conditional entropy of the target area’s boolean mask, conditioned upon the 2.417 - pair of gene expression boolean masks, is minimized. 2.418 - This finds pairs of genes which are most informative (at least at these dis- 2.419 - cretization thresholds) relative to the question, “Is this surface pixel a member 2.420 - of the target area?”. Its advantage over linear methods such as logistic regres- 2.421 -sion is that it takes account of arbitrarily nonlinear relationships; for example, if the XOR of two variables predicts the 2.422 -target, conditional entropy would notice, whereas linear methods would not. 2.423 - 2.424 + boolean masks of the gene data. 
For each gene, we created a boolean mask 2.425 +of its expression levels using each of these thresholds: the mean of that gene, the mean minus one standard deviation, the 2.426 +mean minus two standard deviations, the mean plus one standard deviation, the mean plus two standard deviations. 2.427 +Now, for each region, we created and ran a forward stepwise procedure which attempted to find pairs of gene expression 2.428 +boolean masks such that the conditional entropy of the target area’s boolean mask, conditioned upon the pair of gene 2.429 +expression boolean masks, is minimized. 2.430 +This finds pairs of genes which are most informative (at least at these discretization thresholds) relative to the question, 2.431 +“Is this surface pixel a member of the target area?”. Its advantage over linear methods such as logistic regression is that it 2.432 +takes account of arbitrarily nonlinear relationships; for example, if the XOR of two variables predicts the target, conditional 2.433 +entropy would notice, whereas linear methods would not. 2.434 2.435 2.436 2.437 @@ -508,20 +515,20 @@ 2.438 chose a “difficult” area in order to better contrast pointwise with geometric 2.439 methods. 2.440 Areas which can be identified by single genes Using gradient simi- 2.441 -larity, we have already found single genes which roughly identify some areas and groupings of areas. For each of these areas, 2.442 -an example of a gene which roughly identifies it is shown in Figure 5. We have not yet cross-verified these genes in other 2.443 -atlases. 2.444 -In addition, there are a number of areas which are almost identified by single genes: COAa+NLOT (anterior part of 2.445 -cortical amygdalar area, nucleus of the lateral olfactory tract), ENT (entorhinal), ACAv (ventral anterior cingulate), VIS 2.446 -(visual), AUD (auditory). 
2.447 -These results validate our expectation that the ABA dataset can be exploited to find marker genes for many cortical 2.448 -areas, while also validating the relevancy of our new scoring method, gradient similarity. 2.449 + larity, we have already found single genes which roughly identify some areas 2.450 +and groupings of areas. For each of these areas, an example of a gene which roughly identifies it is shown in Figure 5. We 2.451 +have not yet cross-verified these genes in other atlases. 2.452 _________________________________________ 2.453 17For each gene, a logistic regression in which the response variable was whether or not a surface pixel was within area AUD, and the predictor 2.454 variable was the value of the expression of the gene underneath that pixel. The resulting scores were used to rank the genes in terms of how well 2.455 they predict area AUD. 2.456 18For each gene the gradient similarity between (a) a map of the expression of each gene on the cortical surface and (b) the shape of area AUD, 2.457 was calculated, and this was used to rank the genes. 2.458 +In addition, there are a number of areas which are almost identified by single genes: COAa+NLOT (anterior part of 2.459 +cortical amygdalar area, nucleus of the lateral olfactory tract), ENT (entorhinal), ACAv (ventral anterior cingulate), VIS 2.460 +(visual), AUD (auditory). 2.461 +These results validate our expectation that the ABA dataset can be exploited to find marker genes for many cortical 2.462 +areas, while also validating the relevancy of our new scoring method, gradient similarity. 2.463 Combinations of multiple genes are useful and necessary for some areas 2.464 In Figure 4, we give an example of a cortical area which is not marked by any single gene, but which can be identified 2.465 combinatorially. According to logistic regression, gene wwc1 is the best fit single gene for predicting whether or not a 2.466 @@ -533,10 +540,10 @@ 2.467 on the medial surface.
By adding together the values at each pixel in these two figures, we get the lower-left image. This 2.468 combination captures area MO much better than any single gene. 2.469 This shows that our proposal to develop a method to find combinations of marker genes is both possible and necessary. 2.470 -Feature selection integrated with prediction As noted earlier, in general, any predictive method can be used for 2.471 -feature selection by running it inside a stepwise wrapper. Also, some predictive methods integrate soft constraints on number 2.472 -of features used. Examples of both of these will be seen in the section “Multivariate Predictive methods”. 2.473 -Multivariate Predictive methods 2.474 +Feature selection integrated with prediction As noted earlier, in general, any classifier can be used for feature 2.475 +selection by running it inside a stepwise wrapper. Also, some learning algorithms integrate soft constraints on number of 2.476 +features used. Examples of both of these will be seen in the section “Multivariate supervised learning”. 2.477 +Multivariate supervised learning 2.478 2.479 2.480 2.481 @@ -578,6 +585,8 @@ 2.482 that looks at all the genes at once isn’t as prac- 2.483 tically useful as a classifier that uses only a few 2.484 genes. 2.485 +_________________________________________ 2.486 + 195-fold cross-validation. 2.487 Data-driven redrawing of the cor- 2.488 tical map 2.489 We have applied the following dimensionality reduction algorithms to reduce the dimensionality of the gene expression 2.490 @@ -585,8 +594,6 @@ 2.491 (MDS), Isomap, Landmark Isomap, Laplacian eigenmaps, Local Tangent Space Alignment (LTSA), Hessian locally linear 2.492 embedding, Diffusion maps, Stochastic Neighbor Embedding (SNE), Stochastic Proximity Embedding (SPE), Fast Maximum 2.493 Variance Unfolding (FastMVU), Non-negative Matrix Factorization (NNMF). 
Space constraints prevent us from showing 2.494 -_________________________________________ 2.495 - 195-fold cross-validation. 2.496 many of the results, but as a sample, PCA, NNMF, and landmark Isomap are shown in the first, second, and third rows of 2.497 Figure 6. 2.498 After applying the dimensionality reduction, we ran clustering algorithms on the reduced data. To date we have tried 2.499 @@ -623,10 +630,12 @@ 2.500 geometric, and information-theoretic measures. We already developed one entirely new scoring method (gradient similarity), 2.501 but we may develop more. Scoring measures that we will explore will include the L1 norm, correlation, expression energy 2.502 ratio, conditional entropy, gradient similarity, Jaccard similarity, Dice similarity, Hough transform, and statistical tests such 2.503 -as Student’s t-test, and the Mann-Whitney U test (a non-parametric test). In addition, any predictive procedure induces a 2.504 -scoring measure on genes by taking the prediction error when using that gene to predict the target. 2.505 +as Student’s t-test, and the Mann-Whitney U test (a non-parametric test). In addition, any classifier induces a scoring 2.506 +measure on genes by taking the prediction error when using that gene to predict the target. 2.507 Using some combination of these measures, we will develop a procedure to find single marker genes for anatomical regions: 2.508 -for each cortical area, we will rank the genes by their ability to delineate each area. 2.509 +for each cortical area, we will rank the genes by their ability to delineate each area. We will quantitatively compare the list 2.510 +of single genes generated by our method to the lists generated by previous methods which are mentioned in Aim 1 Related 2.511 +Work. 2.512 Some cortical areas have no single marker genes but can be identified by combinatorial coding. This requires multivariate 2.513 scoring measures and feature selection procedures. 
Many of the measures, such as expression energy, gradient similarity, 2.514 Jaccard, Dice, Hough, Student’s t, and Mann-Whitney U are univariate. We will extend these scoring measures for use 2.515 @@ -635,33 +644,35 @@ 2.516 Hotelling’s T-square is a multivariate analog of Student’s t. 2.517 We will develop a feature selection procedure for choosing the best small set of marker genes for a given anatomical 2.518 area. In addition to using the scoring measures that we develop, we will also explore (a) feature selection using a stepwise 2.519 -wrapper over “vanilla” predictive methods such as logistic regression, (b) predictive methods such as decision trees which 2.520 -incrementally/greedily combine single gene markers into sets, and (c) predictive methods which use soft constraints to 2.521 -minimize number of features used, such as sparse support vector machines. 2.522 -todo 2.523 -Some of these methods, such as the Hough transform, are designed to be resistant to registration error and error in the 2.524 -anatomical map. 2.525 -We will also consider extensions to scoring measures that may improve their robustness to registration error and to 2.526 -error in the anatomical map; for example, a wrapper that runs a scoring method on small displacements and distortions 2.527 -of the data adds robustness to registration error at the expense of computation time. It is possible that some areas in the 2.528 -anatomical map do not correspond to natural domains of gene expression. 2.529 -# Extend the procedure to handle difficult areas by combining or redrawing the boundaries: An area may be difficult to 2.530 -identify because the boundaries are misdrawn, or because it does not “really” exist as a single area, at least on the genetic 2.531 -level. 
We will develop extensions to our procedure which (a) detect when a difficult area could be fit if its boundary were 2.532 -redrawn slightly, and (b) detect when a difficult area could be combined with adjacent areas to create a larger area which 2.533 -can be fit. 2.534 +wrapper over “vanilla” classifiers such as logistic regression, (b) supervised learning methods such as decision trees which 2.535 +incrementally/greedily combine single gene markers into sets, and (c) supervised learning methods which use soft constraints 2.536 +to minimize number of features used, such as sparse support vector machines. 2.537 +Since errors of displacement and of shape may cause genes and target areas to match less than they should, we will 2.538 +consider the robustness of feature selection methods in the presence of error. Some of these methods, such as the Hough 2.539 +transform, are designed to be resistant in the presence of error, but many are not. We will consider extensions to scoring 2.540 +measures that may improve their robustness; for example, a wrapper that runs a scoring method on small displacements 2.541 +and distortions of the data adds robustness to registration error at the expense of computation time. 2.542 +An area may be difficult to identify because the boundaries are misdrawn in the atlas, or because the shape of the natural 2.543 +domain of gene expression corresponding to the area is different from the shape of the area as recognized by anatomists. 2.544 +We will extend our procedure to handle difficult areas by combining areas or redrawing their boundaries. We will develop 2.545 +extensions to our procedure which (a) detect when a difficult area could be fit if its boundary were redrawn slightly, and (b) 2.546 +detect when a difficult area could be combined with adjacent areas to create a larger area which can be fit. 
2.547 A future publication on the method that we develop in Aim 1 will review the scoring measures and quantitatively compare 2.548 their performance in order to provide a foundation for future research of methods of marker gene finding. We will measure 2.549 the robustness of the scoring measures as well as their absolute performance on our dataset. 2.550 -Decision trees todo 2.551 -20. 2.552 +Classifiers 2.553 +We will explore and compare different classifiers. As noted above, this activity is not separate from the previous one, 2.554 +because some supervised learning algorithms include feature selection, and any classifier can be combined with a stepwise 2.555 +wrapper for use as a feature selection method. We will explore logistic regression (including spatial models[15]), decision 2.556 +trees20 , sparse SVMs, generative mixture models (including naive bayes), kernel density estimation, genetic algorithms, and 2.557 +artificial neural networks. 2.558 +Decision trees 2.559 # confirm with EMAGE, GeneAtlas, GENSAT, etc, to fight overfitting, two hemis 2.560 -# mixture models, etc 2.561 Develop algorithms to suggest a division of a structure into anatomical parts 2.562 1.Explore dimensionality reduction algorithms applied to pixels: including TODO 2.563 2.Explore dimensionality reduction algorithms applied to genes: including TODO 2.564 3.Explore clustering algorithms applied to pixels: including TODO 2.565 -4.Explore clustering algorithms applied to genes: including gene shaving, TODO 2.566 +4.Explore clustering algorithms applied to genes: including gene shaving[9], TODO 2.567 5.Develop an algorithm to use dimensionality reduction and/or hierarchical clustering to create anatomical maps 2.568 6.Run this algorithm on the cortex: present a hierarchical, genoarchitectonic map of the cortex 2.569 # Linear discriminant analysis 2.570 @@ -677,6 +688,10 @@ 2.571 developed in Aim 2, we will present one or more hierarchical cortical maps.
We will identify and explain how the statistical 2.572 structure in the gene expression data led to any unexpected or interesting features of these maps, and we will provide 2.573 biological hypotheses to interpret any new cortical areas, or groupings of areas, which are discovered. 2.574 +_________________________________________ 2.575 + 20Actually, we have already begun to explore decision trees. For each cortical area, we have used the C4.5 algorithm to find a decision tree for 2.576 +that area. We achieved good classification accuracy on our training set, but the number of genes that appeared in each tree was too large. We 2.577 +plan to implement a pruning procedure to generate trees that use fewer genes. 2.578 Timeline and milestones 2.579 Finding marker genes 2.580 ∙September-November 2009: Develop an automated mechanism for segmenting the cortical voxels into layers 2.581 @@ -686,10 +701,6 @@ 2.582 test out various dimensionality reduction schemes in combination with supervised learning. create or extend supervised 2.583 learning frameworks which use multivariate versions of the best scoring methods. 2.584 ∙January 2010 (milestone): Submit a publication on single marker genes for cortical areas 2.585 -_________________________________________ 2.586 - 20Already, for each cortical area, we have used the C4.5 algorithm to find a decision tree for that area. We achieved good classification accuracy 2.587 -on our training set, but the number of genes that appeared in each tree was too large. We plan to implement a pruning procedure to generate 2.588 -trees that use fewer genes 2.589 ∙February-July 2010: Continue to develop scoring methods and supervised learning frameworks. Explore the best way 2.590 to integrate radial profiles with supervised learning. Explore the best way to make supervised learning techniques 2.591 robust against incorrect labels (i.e. when the areas drawn on the input cortical map are slightly off). 
Quantitatively 2.592 @@ -736,52 +747,57 @@ 2.593 [8]Shiaoching Gong, Chen Zheng, Martin L. Doughty, Kasia Losos, Nicholas Didkovsky, Uta B. Schambra, Norma J. 2.594 Nowak, Alexandra Joyner, Gabrielle Leblanc, Mary E. Hatten, and Nathaniel Heintz. A gene expression atlas of the 2.595 central nervous system based on bacterial artificial chromosomes. Nature, 425(6961):917–925, October 2003. 2.596 -[9]Jano Hemert and Richard Baldock. Matching Spatial Regions with Combinations of Interacting Gene Expression Pat- 2.597 +[9]Trevor Hastie, Robert Tibshirani, Michael Eisen, Ash Alizadeh, Ronald Levy, Louis Staudt, Wing Chan, David Botstein, 2.598 +and Patrick Brown. ’Gene shaving’ as a method for identifying distinct sets of genes with similar expression patterns. 2.599 +Genome Biology, 1(2):research0003.1–research0003.21, 2000. 2.600 +[10]Jano Hemert and Richard Baldock. Matching Spatial Regions with Combinations of Interacting Gene Expression Pat- 2.601 terns, volume 13 of Communications in Computer and Information Science, pages 347–361. Springer Berlin Heidelberg, 2.602 2008. 2.603 -[10]F. Kruggel, M. K. Brückner, Th. Arendt, C. J. Wiggins, and D. Y. von Cramon. Analyzing the neocortical fine-structure. 2.604 +[11]F. Kruggel, M. K. Brückner, Th. Arendt, C. J. Wiggins, and D. Y. von Cramon. Analyzing the neocortical fine-structure. 2.605 Medical Image Analysis, 7(3):251–264, September 2003. 2.606 -[11]Erh-Fang Lee, Jyl Boline, and Arthur W. Toga. A High-Resolution anatomical framework of the neonatal mouse brain 2.607 +[12]Erh-Fang Lee, Jyl Boline, and Arthur W. Toga. A High-Resolution anatomical framework of the neonatal mouse brain 2.608 for managing gene expression data. Frontiers in Neuroinformatics, 1:6, 2007. PMC2525996. 2.609 -[12]Susan Magdaleno, Patricia Jensen, Craig L. Brumwell, Anna Seal, Karen Lehman, Andrew Asbury, Tony Cheung, 2.610 +[13]Susan Magdaleno, Patricia Jensen, Craig L.
Brumwell, Anna Seal, Karen Lehman, Andrew Asbury, Tony Cheung, 2.611 Tommie Cornelius, Diana M. Batten, Christopher Eden, Shannon M. Norland, Dennis S. Rice, Nilesh Dosooye, Sundeep 2.612 Shakya, Perdeep Mehta, and Tom Curran. BGEM: an in situ hybridization database of gene expression in the embryonic 2.613 and adult mouse nervous system. PLoS Biology, 4(4):e86 EP –, April 2006. 2.614 -[13]Lydia Ng, Amy Bernard, Chris Lau, Caroline C Overly, Hong-Wei Dong, Chihchau Kuan, Sayan Pathak, Susan M 2.615 +[14]Lydia Ng, Amy Bernard, Chris Lau, Caroline C Overly, Hong-Wei Dong, Chihchau Kuan, Sayan Pathak, Susan M 2.616 Sunkin, Chinh Dang, Jason W Bohland, Hemant Bokil, Partha P Mitra, Luis Puelles, John Hohmann, David J Anderson, 2.617 Ed S Lein, Allan R Jones, and Michael Hawrylycz. An anatomic gene expression atlas of the adult mouse brain. Nat 2.618 Neurosci, 12(3):356–362, March 2009. 2.619 -[14]George Paxinos and Keith B.J. Franklin. The Mouse Brain in Stereotaxic Coordinates. Academic Press, 2 edition, July 2.620 +[15]Christopher J. Paciorek. Computational techniques for spatial logistic regression with large data sets. Computational 2.621 +Statistics & Data Analysis, 51(8):3631–3653, May 2007. 2.622 +[16]George Paxinos and Keith B.J. Franklin. The Mouse Brain in Stereotaxic Coordinates. Academic Press, 2 edition, July 2.623 2001. 2.624 -[15]A. Schleicher, N. Palomero-Gallagher, P. Morosan, S. Eickhoff, T. Kowalski, K. Vos, K. Amunts, and K. Zilles. Quanti- 2.625 +[17]A. Schleicher, N. Palomero-Gallagher, P. Morosan, S. Eickhoff, T. Kowalski, K. Vos, K. Amunts, and K. Zilles. Quanti- 2.626 tative architectural analysis: a new approach to cortical mapping. Anatomy and Embryology, 210(5):373–386, December 2.627 2005. 2.628 -[16]Oliver Schmitt, Lars Hömke, and Lutz Dümbgen. Detection of cortical transition regions utilizing statistical analyses of 2.629 +[18]Oliver Schmitt, Lars Hömke, and Lutz Dümbgen.
Detection of cortical transition regions utilizing statistical analyses of 2.630 excess masses. NeuroImage, 19(1):42–63, May 2003. 2.631 -[17]Constance M. Smith, Jacqueline H. Finger, Terry F. Hayamizu, Ingeborg J. McCright, Janan T. Eppig, James A. 2.632 +[19]Constance M. Smith, Jacqueline H. Finger, Terry F. Hayamizu, Ingeborg J. McCright, Janan T. Eppig, James A. 2.633 Kadin, Joel E. Richardson, and Martin Ringwald. The mouse gene expression database (GXD): 2007 update. Nucl. 2.634 Acids Res., 35(suppl_1):D618–623, 2007. 2.635 -[18]Judy Sprague, Leyla Bayraktaroglu, Dave Clements, Tom Conlin, David Fashena, Ken Frazer, Melissa Haendel, Dou- 2.636 +[20]Judy Sprague, Leyla Bayraktaroglu, Dave Clements, Tom Conlin, David Fashena, Ken Frazer, Melissa Haendel, Dou- 2.637 glas G Howe, Prita Mani, Sridhar Ramachandran, Kevin Schaper, Erik Segerdell, Peiran Song, Brock Sprunger, Sierra 2.638 Taylor, Ceri E Van Slyke, and Monte Westerfield. The zebrafish information network: the zebrafish model organism 2.639 database. Nucleic Acids Research, 34(Database issue):D581–5, 2006. PMID: 16381936. 2.640 -[19]Larry Swanson. Brain Maps: Structure of the Rat Brain. Academic Press, 3 edition, November 2003. 2.641 -[20]Carol L. Thompson, Sayan D. Pathak, Andreas Jeromin, Lydia L. Ng, Cameron R. MacPherson, Marty T. Mortrud, 2.642 +[21]Larry Swanson. Brain Maps: Structure of the Rat Brain. Academic Press, 3 edition, November 2003. 2.643 +[22]Carol L. Thompson, Sayan D. Pathak, Andreas Jeromin, Lydia L. Ng, Cameron R. MacPherson, Marty T. Mortrud, 2.644 Allison Cusick, Zackery L. Riley, Susan M. Sunkin, Amy Bernard, Ralph B. Puchalski, Fred H. Gage, Allan R. Jones, 2.645 Vladimir B. Bajic, Michael J. Hawrylycz, and Ed S. Lein. Genomic anatomy of the hippocampus. Neuron, 60(6):1010– 2.646 1021, December 2008. 
2.647 -[21]Pavel Tomancak, Amy Beaton, Richard Weiszmann, Elaine Kwan, ShengQiang Shu, Suzanna E Lewis, Stephen 2.648 +[23]Pavel Tomancak, Amy Beaton, Richard Weiszmann, Elaine Kwan, ShengQiang Shu, Suzanna E Lewis, Stephen 2.649 Richards, Michael Ashburner, Volker Hartenstein, Susan E Celniker, and Gerald M Rubin. Systematic determina- 2.650 tion of patterns of gene expression during Drosophila embryogenesis. Genome Biology, 3(12):research0088.1–research0088.14, 2002. 2.651 PMC151190. 2.652 -[22]Jano van Hemert and Richard Baldock. Mining Spatial Gene Expression Data for Association Rules, volume 4414/2007 2.653 +[24]Jano van Hemert and Richard Baldock. Mining Spatial Gene Expression Data for Association Rules, volume 4414/2007 2.654 of Lecture Notes in Computer Science, pages 66–76. Springer Berlin / Heidelberg, 2007. 2.655 -[23]Shanmugasundaram Venkataraman, Peter Stevenson, Yiya Yang, Lorna Richardson, Nicholas Burton, Thomas P. Perry, 2.656 +[25]Shanmugasundaram Venkataraman, Peter Stevenson, Yiya Yang, Lorna Richardson, Nicholas Burton, Thomas P. Perry, 2.657 Paul Smith, Richard A. Baldock, Duncan R. Davidson, and Jeffrey H. Christiansen. EMAGE edinburgh mouse atlas 2.658 of gene expression: 2008 update. Nucl. Acids Res., 36(suppl_1):D860–865, 2008. 2.659 -[24]Axel Visel, Christina Thaller, and Gregor Eichele. GenePaint.org: an atlas of gene expression patterns in the mouse 2.660 +[26]Axel Visel, Christina Thaller, and Gregor Eichele. GenePaint.org: an atlas of gene expression patterns in the mouse 2.661 embryo. Nucl. Acids Res., 32(suppl_1):D552–556, 2004. 
2.662 -[25]Robert H Waterston, Kerstin Lindblad-Toh, Ewan Birney, Jane Rogers, Josep F Abril, Pankaj Agarwal, Richa Agar- 2.663 +[27]Robert H Waterston, Kerstin Lindblad-Toh, Ewan Birney, Jane Rogers, Josep F Abril, Pankaj Agarwal, Richa Agar- 2.664 wala, Rachel Ainscough, Marina Alexandersson, Peter An, Stylianos E Antonarakis, John Attwood, Robert Baertsch, 2.665 Jonathon Bailey, Karen Barlow, Stephan Beck, Eric Berry, Bruce Birren, Toby Bloom, Peer Bork, Marc Botcherby, 2.666 Nicolas Bray, Michael R Brent, Daniel G Brown, Stephen D Brown, Carol Bult, John Burton, Jonathan Butler,
3.1 Binary file grant.odt has changed
4.1 Binary file grant.pdf has changed
5.1 --- a/grant.txt Tue Apr 21 14:50:10 2009 -0700 5.2 +++ b/grant.txt Tue Apr 21 17:35:00 2009 -0700 5.3 @@ -14,7 +14,7 @@ 5.4 5.5 (3) create a 2-D "flat map" dataset of the mouse cerebral cortex that contains a flattened version of the Allen Mouse Brain Atlas ISH data, as well as the boundaries of cortical anatomical areas. This will involve extending the functionality of Caret, an existing open-source scientific imaging program. Use this dataset to validate the methods developed in (1) and (2).\\ 5.6 5.7 -Although our particular application involves the 3D spatial distribution of gene expression, we anticipate that the methods developed in aims (1) and (2) will generalize to any sort of high-dimensional data over points located in a low-dimensional space. 5.8 +Although our particular application involves the 3D spatial distribution of gene expression, we anticipate that the methods developed in aims (1) and (2) will generalize to any sort of high-dimensional data over points located in a low-dimensional space. In particular, our method could be applied to genome-wide sequencing data derived from sets of tissues and disease states. 5.9 5.10 In terms of the application of the methods to cerebral cortex, aim (1) is to go from cortical areas to marker genes, and aim (2) is to let the gene profile define the cortical areas. In addition to validating the usefulness of the algorithms, the application of these methods to cortex will produce immediate benefits, because there are currently no known genetic markers for most cortical areas. The results of the project will support the development of new ways to selectively target cortical areas, and it will support the development of a method for identifying the cortical areal boundaries present in small tissue samples. 5.11 5.12 @@ -29,11 +29,11 @@ 5.13 5.14 == The Challenge and Potential impact == 5.15 5.16 -Now we will discuss each of our three aims in turn. 
For each aim, we will develop a conceptual framework for thinking about the task, and we will present our strategy for solving it. Next we will discuss related work. At the conclusion of each section, we will summarize why our strategy is different from what has been done before. At the end of this section, we will describe the potential impact. 5.17 +Each of our three aims will be discussed in turn. For each aim, we will develop a conceptual framework for thinking about the task, and we will present our strategy for solving it. Next we will discuss related work. At the conclusion of each section, we will summarize why our strategy is different from what has been done before. At the end of this section, we will describe the potential impact. 5.18 5.19 === Aim 1: Given a map of regions, find genes that mark the regions === 5.20 5.21 -\vspace{0.3cm}**Machine learning terminology** The task of looking for marker genes for known anatomical regions means that one is looking for a set of genes such that, if the expression level of those genes is known, then the locations of the regions can be inferred. 5.22 +\vspace{0.3cm}**Machine learning terminology: classifiers** The task of looking for marker genes for known anatomical regions means that one is looking for a set of genes such that, if the expression level of those genes is known, then the locations of the regions can be inferred. 5.23 5.24 %% then instead of saying that we are using gene expression to find the locations of the regions, 5.25 5.26 @@ -41,7 +41,7 @@ 5.27 5.28 %%Therefore, an understanding of the relationship between the combination of their expression levels and the locations of the regions may be expressed as a function. The input to this function is a voxel, along with the gene expression levels within that voxel; the output is the regional identity of the target voxel, that is, the region to which the target voxel belongs. We call this function a __classifier__. 
In general, the input to a classifier is called an __instance__, and the output is called a __label__ (or a __class label__). 5.29 5.30 -If we define the regions so that they cover the entire anatomical structure to be divided, we may say that we are using gene expression to determine to which region each voxel within the structure belongs. We call this a __classification task__, because each voxel is being assigned to a class (namely, its region). An understanding of the relationship between the combination of their expression levels and the locations of the regions may be expressed as a function. The input to this function is a voxel, along with the gene expression levels within that voxel; the output is the regional identity of the target voxel, that is, the region to which the target voxel belongs. We call this function a __classifier__. In general, the input to a classifier is called an __instance__, and the output is called a __label__ (or a __class label__). 5.31 +If we define the regions so that they cover the entire anatomical structure to be subdivided, we may say that we are using gene expression in each voxel to assign that voxel to the proper area. We call this a __classification task__, because each voxel is being assigned to a class (namely, its region). An understanding of the relationship between the combination of their expression levels and the locations of the regions may be expressed as a function. The input to this function is a voxel, along with the gene expression levels within that voxel; the output is the regional identity of the target voxel, that is, the region to which the target voxel belongs. We call this function a __classifier__. In general, the input to a classifier is called an __instance__, and the output is called a __label__ (or a __class label__). 
5.32 5.33 %% The construction of the classifier is called __training__ (also __learning__), and 5.34 5.35 @@ -53,6 +53,9 @@ 5.36 5.37 Although the classifier itself may only look at the gene expression data within each voxel before classifying that voxel, the algorithm which constructs the classifier may look over the entire dataset. We can categorize score-based feature selection methods depending on how the score is calculated. Often the score calculation consists of assigning a sub-score to each voxel, and then aggregating these sub-scores into a final score (the aggregation is often a sum or a sum of squares or average). If only information from nearby voxels is used to calculate a voxel's sub-score, then we say it is a __local scoring method__. If only information from the voxel itself is used to calculate a voxel's sub-score, then we say it is a __pointwise scoring method__. 5.38 5.39 +Both gene expression data and anatomical atlases have errors, due to a variety of factors. Individual subjects have idiosyncratic anatomy. Subjects may be improperly registered to the atlas. The method used to measure gene expression may be noisy. The atlas may have errors. It is even possible that some areas in the anatomical atlas are "wrong" in that they do not have the same shape as the natural domains of gene expression to which they correspond. These sources of error can affect the displacement and the shape of both the gene expression data and the anatomical target areas. Therefore, it is important to use feature selection methods which are robust to these kinds of errors. 5.40 + 5.41 + 5.42 === Our strategy for Aim 1 === 5.43 5.44 Key questions when choosing a learning method are: What are the instances? What are the features? How are the features chosen? Here are four principles that outline our answers to these questions. 5.45 @@ -290,19 +293,7 @@ 5.46 5.47 We downloaded the ABA data and applied a mask to select only those voxels which belong to cerebral cortex. 
We divided the cortex into hemispheres. 5.48 5.49 -Using Caret\cite{van_essen_integrated_2001}, we created a mesh representation of the surface of the selected voxels. For each gene, for each node of the mesh, we calculated an average of the gene expression of the voxels "underneath" that mesh node. We then flattened the cortex, creating a two-dimensional mesh. 5.50 - 5.51 -We sampled the nodes of the irregular, flat mesh in order to create a regular grid of pixel values. We converted this grid into a MATLAB matrix. 5.52 - 5.53 -We manually traced the boundaries of each of 49 cortical areas from the ABA coronal reference atlas slides. We then converted these manual traces into Caret-format regional boundary data on the mesh surface. We projected the regions onto the 2-d mesh, and then onto the grid, and then we converted the region data into MATLAB format. 5.54 - 5.55 -At this point, the data are in the form of a number of 2-D matrices, all in registration, with the matrix entries representing a grid of points (pixels) over the cortical surface: 5.56 - 5.57 - 5.58 - 5.59 -* A 2-D matrix whose entries represent the regional label associated with each surface pixel 5.60 -* For each gene, a 2-D matrix whose entries represent the average expression level underneath each surface pixel 5.61 - 5.62 +Using Caret\cite{van_essen_integrated_2001}, we created a mesh representation of the surface of the selected voxels. For each gene, and for each node of the mesh, we calculated an average of the gene expression of the voxels "underneath" that mesh node. We then flattened the cortex, creating a two-dimensional mesh. 5.63 5.64 5.65 \begin{wrapfigure}{L}{0.35\textwidth}\centering 5.66 @@ -316,6 +307,19 @@ 5.67 \caption{The top row shows the two genes which (individually) best predict area AUD, according to logistic regression. The bottom row shows the two genes which (individually) best match area AUD, according to gradient similarity. 
From left to right and top to bottom, the genes are $Ssr1$, $Efcbp1$, $Ptk7$, and $Aph1a$.} 5.68 \label{AUDgeometry}\end{wrapfigure} 5.69 5.70 +We sampled the nodes of the irregular, flat mesh in order to create a regular grid of pixel values. We converted this grid into a MATLAB matrix. 5.71 + 5.72 +We manually traced the boundaries of each of 49 cortical areas from the ABA coronal reference atlas slides. We then converted these manual traces into Caret-format regional boundary data on the mesh surface. We projected the regions onto the 2-d mesh, and then onto the grid, and then we converted the region data into MATLAB format. 5.73 + 5.74 +At this point, the data are in the form of a number of 2-D matrices, all in registration, with the matrix entries representing a grid of points (pixels) over the cortical surface: 5.75 + 5.76 + 5.77 + 5.78 +* A 2-D matrix whose entries represent the regional label associated with each surface pixel 5.79 +* For each gene, a 2-D matrix whose entries represent the average expression level underneath each surface pixel 5.80 + 5.81 + 5.82 + 5.83 We created a normalized version of the gene expression data by subtracting each gene's mean expression level (over all surface pixels) and dividing the expression level of each gene by its standard deviation. 5.84 5.85 The features and the target area are both functions on the surface pixels. They can be referred to as scalar fields over the space of surface pixels; alternately, they can be thought of as images which can be displayed on the flatmapped surface. 5.86 @@ -339,15 +343,6 @@ 5.87 5.88 5.89 5.90 -\vspace{0.3cm}**Correlation** 5.91 -Recall that the instances are surface pixels, and consider the problem of attempting to classify each instance as either a member of a particular anatomical area, or not. The target area can be represented as a boolean mask over the surface pixels. 
5.92 - 5.93 -One class of feature selection scoring methods contains methods which calculate some sort of "match" between each gene image and the target image. Those genes which match the best are good candidates for features. 5.94 - 5.95 -One of the simplest methods in this class is to use correlation as the match score. We calculated the correlation between each gene and each cortical area. The top row of Figure \ref{SScorrLr} shows the three genes most correlated with area SS. 5.96 - 5.97 - 5.98 - 5.99 \begin{wrapfigure}{L}{0.35\textwidth}\centering 5.100 \includegraphics[scale=.27]{MO_vs_Wwc1_jet.eps}\includegraphics[scale=.27]{MO_vs_Mtif2_jet.eps} 5.101 5.102 @@ -355,6 +350,15 @@ 5.103 \caption{Upper left: $wwc1$. Upper right: $mtif2$. Lower left: wwc1 + mtif2 (each pixel's value on the lower left is the sum of the corresponding pixels in the upper row).} 5.104 \label{MOcombo}\end{wrapfigure} 5.105 5.106 +\vspace{0.3cm}**Correlation** 5.107 +Recall that the instances are surface pixels, and consider the problem of attempting to classify each instance as either a member of a particular anatomical area, or not. The target area can be represented as a boolean mask over the surface pixels. 5.108 + 5.109 +One class of feature selection scoring methods contains methods which calculate some sort of "match" between each gene image and the target image. Those genes which match the best are good candidates for features. 5.110 + 5.111 +One of the simplest methods in this class is to use correlation as the match score. We calculated the correlation between each gene and each cortical area. The top row of Figure \ref{SScorrLr} shows the three genes most correlated with area SS. 5.112 + 5.113 + 5.114 + 5.115 \vspace{0.3cm}**Conditional entropy** 5.116 An information-theoretic scoring method is to find features such that, if the features (gene expression levels) are known, uncertainty about the target (the regional identity) is reduced. 
Entropy measures uncertainty, so what we want is to find features such that the conditional distribution of the target has minimal entropy. The distribution to which we are referring is the probability distribution over the population of surface pixels. 5.117 5.118 @@ -424,10 +428,10 @@ 5.119 5.120 5.121 \vspace{0.3cm}**Feature selection integrated with prediction** 5.122 -As noted earlier, in general, any predictive method can be used for feature selection by running it inside a stepwise wrapper. Also, some predictive methods integrate soft constraints on number of features used. Examples of both of these will be seen in the section "Multivariate Predictive methods". 5.123 - 5.124 - 5.125 -=== Multivariate Predictive methods === 5.126 +As noted earlier, in general, any classifier can be used for feature selection by running it inside a stepwise wrapper. Also, some learning algorithms integrate soft constraints on number of features used. Examples of both of these will be seen in the section "Multivariate supervised learning". 5.127 + 5.128 + 5.129 +=== Multivariate supervised learning === 5.130 5.131 5.132 \begin{wrapfigure}{L}{0.6\textwidth}\centering 5.133 @@ -504,47 +508,39 @@ 5.134 5.135 %%We will develop scoring methods for evaluating how good individual genes are at marking areas. We will compare pointwise, geometric, and information-theoretic measures. We already developed one entirely new scoring method (gradient similarity), but we may develop more. Scoring measures that we will explore will include the L1 norm, correlation, expression energy ratio, conditional entropy, gradient similarity, Jaccard similarity, Dice similarity, Hough transform, and statistical tests such as Hotelling's T-square test (a multivariate generalization of Student's t-test), ANOVA, and a multivariate version of the Mann-Whitney U test (a non-parametric test). 5.136 5.137 -We will develop scoring methods for evaluating how good individual genes are at marking areas. 
We will compare pointwise, geometric, and information-theoretic measures. We already developed one entirely new scoring method (gradient similarity), but we may develop more. Scoring measures that we will explore will include the L1 norm, correlation, expression energy ratio, conditional entropy, gradient similarity, Jaccard similarity, Dice similarity, Hough transform, and statistical tests such as Student's t-test, and the Mann-Whitney U test (a non-parametric test). In addition, any predictive procedure induces a scoring measure on genes by taking the prediction error when using that gene to predict the target. 5.138 - 5.139 - 5.140 - 5.141 -Using some combination of these measures, we will develop a procedure to find single marker genes for anatomical regions: for each cortical area, we will rank the genes by their ability to delineate each area. 5.142 +We will develop scoring methods for evaluating how good individual genes are at marking areas. We will compare pointwise, geometric, and information-theoretic measures. We already developed one entirely new scoring method (gradient similarity), but we may develop more. Scoring measures that we will explore will include the L1 norm, correlation, expression energy ratio, conditional entropy, gradient similarity, Jaccard similarity, Dice similarity, Hough transform, and statistical tests such as Student's t-test, and the Mann-Whitney U test (a non-parametric test). In addition, any classifier induces a scoring measure on genes by taking the prediction error when using that gene to predict the target. 5.143 + 5.144 +Using some combination of these measures, we will develop a procedure to find single marker genes for anatomical regions: for each cortical area, we will rank the genes by their ability to delineate each area. We will quantitatively compare the list of single genes generated by our method to the lists generated by previous methods which are mentioned in Aim 1 Related Work. 
5.145 + 5.146 5.147 Some cortical areas have no single marker genes but can be identified by combinatorial coding. This requires multivariate scoring measures and feature selection procedures. Many of the measures, such as expression energy, gradient similarity, Jaccard, Dice, Hough, Student's t, and Mann-Whitney U are univariate. We will extend these scoring measures for use in multivariate feature selection, that is, for scoring how well combinations of genes, rather than individual genes, can distinguish a target area. There are existing multivariate forms of some of the univariate scoring measures, for example, Hotelling's T-square is a multivariate analog of Student's t. 5.148 5.149 -We will develop a feature selection procedure for choosing the best small set of marker genes for a given anatomical area. In addition to using the scoring measures that we develop, we will also explore (a) feature selection using a stepwise wrapper over "vanilla" predictive methods such as logistic regression, (b) predictive methods such as decision trees which incrementally/greedily combine single gene markers into sets, and (c) predictive methods which use soft constraints to minimize number of features used, such as sparse support vector machines. 5.150 - 5.151 -todo 5.152 - 5.153 -Some of these methods, such as the Hough transform, are designed to be resistant to registration error and error in the anatomical map. 5.154 - 5.155 -We will also consider extensions to scoring measures that may improve their robustness to registration error and to error in the anatomical map; for example, a wrapper that runs a scoring method on small displacements and distortions of the data adds robustness to registration error at the expense of computation time. It is possible that some areas in the anatomical map do not correspond to natural domains of gene expression. 
5.156 - 5.157 -# Extend the procedure to handle difficult areas by combining or redrawing the boundaries: An area may be difficult to identify because the boundaries are misdrawn, or because it does not "really" exist as a single area, at least on the genetic level. We will develop extensions to our procedure which (a) detect when a difficult area could be fit if its boundary were redrawn slightly, and (b) detect when a difficult area could be combined with adjacent areas to create a larger area which can be fit. 5.158 - 5.159 +We will develop a feature selection procedure for choosing the best small set of marker genes for a given anatomical area. In addition to using the scoring measures that we develop, we will also explore (a) feature selection using a stepwise wrapper over "vanilla" classifiers such as logistic regression, (b) supervised learning methods such as decision trees which incrementally/greedily combine single gene markers into sets, and (c) supervised learning methods which use soft constraints to minimize number of features used, such as sparse support vector machines. 5.160 + 5.161 +Since errors of displacement and of shape may cause genes and target areas to match less than they should, we will consider the robustness of feature selection methods in the presence of error. Some of these methods, such as the Hough transform, are designed to be resistant in the presence of error, but many are not. We will consider extensions to scoring measures that may improve their robustness; for example, a wrapper that runs a scoring method on small displacements and distortions of the data adds robustness to registration error at the expense of computation time. 5.162 + 5.163 +An area may be difficult to identify because the boundaries are misdrawn in the atlas, or because the shape of the natural domain of gene expression corresponding to the area is different from the shape of the area as recognized by anatomists. 
We will extend our procedure to handle difficult areas by combining areas or redrawing their boundaries. We will develop extensions to our procedure which (a) detect when a difficult area could be fit if its boundary were redrawn slightly, and (b) detect when a difficult area could be combined with adjacent areas to create a larger area which can be fit. 5.164 5.165 A future publication on the method that we develop in Aim 1 will review the scoring measures and quantitatively compare their performance in order to provide a foundation for future research of methods of marker gene finding. We will measure the robustness of the scoring measures as well as their absolute performance on our dataset. 5.166 5.167 +\vspace{0.3cm}**Classifiers** 5.168 + 5.169 +We will explore and compare different classifiers. As noted above, this activity is not separate from the previous one, because some supervised learning algorithms include feature selection, and any classifier can be combined with a stepwise wrapper for use as a feature selection method. We will explore logistic regression (including spatial models\cite{paciorek_computational_2007}), decision trees\footnote{Actually, we have already begun to explore decision trees. For each cortical area, we have used the C4.5 algorithm to find a decision tree for that area. We achieved good classification accuracy on our training set, but the number of genes that appeared in each tree was too large. We plan to implement a pruning procedure to generate trees that use fewer genes.}, sparse SVMs, generative mixture models (including naive bayes), kernel density estimation, genetic algorithms, and artificial neural networks. 5.170 + 5.171 5.172 5.173 \vspace{0.3cm}**Decision trees** 5.174 -todo 5.175 - 5.176 -\footnote{Already, for each cortical area, we have used the C4.5 algorithm to find a decision tree for that area. 
We achieved good classification accuracy on our training set, but the number of genes that appeared in each tree was too large. We plan to implement a pruning procedure to generate trees that use fewer genes}. 5.177 + 5.178 5.179 # confirm with EMAGE, GeneAtlas, GENSAT, etc, to fight overfitting, two hemis 5.180 5.181 -# mixture models, etc 5.182 - 5.183 - 5.184 - 5.185 5.186 \vspace{0.3cm}**Develop algorithms to suggest a division of a structure into anatomical parts** 5.187 5.188 # Explore dimensionality reduction algorithms applied to pixels: including TODO 5.189 # Explore dimensionality reduction algorithms applied to genes: including TODO 5.190 # Explore clustering algorithms applied to pixels: including TODO 5.191 -# Explore clustering algorithms applied to genes: including gene shaving, TODO 5.192 +# Explore clustering algorithms applied to genes: including gene shaving\cite{hastie_gene_2000}, TODO 5.193 # Develop an algorithm to use dimensionality reduction and/or hierarchical clustering to create anatomical maps 5.194 # Run this algorithm on the cortex: present a hierarchical, genoarchitectonic map of the cortex 5.195