nsf
changeset 15:395faa66383e
.
| author | bshanks@bshanks.dyndns.org | 
|---|---|
| date | Sun Apr 12 02:49:55 2009 -0700 (16 years ago) | 
| parents | 56a898ced81d | 
| children | 796116742ec5 | 
| files | DocumentConverter.py grant-oldtext.txt grant.doc grant.html grant.odt grant.pdf grant.txt | 
   line diff
     1.1 --- a/DocumentConverter.py	Sat Apr 11 21:43:12 2009 -0700
     1.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.3 @@ -1,151 +0,0 @@
     1.4 -#!/usr/bin/python
     1.5 -#
     1.6 -# PyODConverter (Python OpenDocument Converter) v1.0.0 - 2008-05-05
     1.7 -#
     1.8 -# This script converts a document from one office format to another by
     1.9 -# connecting to an OpenOffice.org instance via Python-UNO bridge.
    1.10 -#
    1.11 -# Copyright (C) 2008 Mirko Nasato <mirko@artofsolving.com>
    1.12 -# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl-2.1.html
    1.13 -# - or any later version.
    1.14 -#
    1.15 -DEFAULT_OPENOFFICE_PORT = 8100
    1.16 -
    1.17 -import uno
    1.18 -from os.path import abspath, isfile, splitext
    1.19 -from com.sun.star.beans import PropertyValue
    1.20 -from com.sun.star.task import ErrorCodeIOException
    1.21 -from com.sun.star.connection import NoConnectException
    1.22 -
    1.23 -FAMILY_TEXT = "Text"
    1.24 -FAMILY_SPREADSHEET = "Spreadsheet"
    1.25 -FAMILY_PRESENTATION = "Presentation"
    1.26 -FAMILY_DRAWING = "Drawing"
    1.27 -
    1.28 -FILTER_MAP = {
    1.29 -    "pdf": {
    1.30 -        FAMILY_TEXT: "writer_pdf_Export",
    1.31 -        FAMILY_SPREADSHEET: "calc_pdf_Export",
    1.32 -        FAMILY_PRESENTATION: "impress_pdf_Export",
    1.33 -        FAMILY_DRAWING: "draw_pdf_Export"
    1.34 -    },
    1.35 -    "html": {
    1.36 -        FAMILY_TEXT: "HTML (StarWriter)",
    1.37 -        FAMILY_SPREADSHEET: "HTML (StarCalc)",
    1.38 -        FAMILY_PRESENTATION: "impress_html_Export"
    1.39 -    },
    1.40 -    "odt": { FAMILY_TEXT: "writer8" },
    1.41 -    "doc": { FAMILY_TEXT: "MS Word 97" },
    1.42 -    "rtf": { FAMILY_TEXT: "Rich Text Format" },
    1.43 -    "txt": { FAMILY_TEXT: "Text" },
    1.44 -    "ods": { FAMILY_SPREADSHEET: "calc8" },
    1.45 -    "xls": { FAMILY_SPREADSHEET: "MS Excel 97" },
    1.46 -    "odp": { FAMILY_PRESENTATION: "impress8" },
    1.47 -    "ppt": { FAMILY_PRESENTATION: "MS PowerPoint 97" },
    1.48 -    "swf": { FAMILY_PRESENTATION: "impress_flash_Export" }
    1.49 -}
    1.50 -# see http://wiki.services.openoffice.org/wiki/Framework/Article/Filter
    1.51 -# for more available filters
    1.52 -
    1.53 -
    1.54 -class DocumentConversionException(Exception):
    1.55 -
    1.56 -    def __init__(self, message):
    1.57 -        self.message = message
    1.58 -
    1.59 -    def __str__(self):
    1.60 -        return self.message
    1.61 -
    1.62 -
    1.63 -class DocumentConverter:
    1.64 -    
    1.65 -    def __init__(self, port=DEFAULT_OPENOFFICE_PORT):
    1.66 -        localContext = uno.getComponentContext()
    1.67 -        resolver = localContext.ServiceManager.createInstanceWithContext("com.sun.star.bridge.UnoUrlResolver", localContext)
    1.68 -        try:
    1.69 -            context = resolver.resolve("uno:socket,host=localhost,port=%s;urp;StarOffice.ComponentContext" % port)
    1.70 -        except NoConnectException:
    1.71 -            raise DocumentConversionException, "failed to connect to OpenOffice.org on port %s" % port
    1.72 -        self.desktop = context.ServiceManager.createInstanceWithContext("com.sun.star.frame.Desktop", context)
    1.73 -
    1.74 -    def convert(self, inputFile, outputFile):
    1.75 -
    1.76 -        inputUrl = self._toFileUrl(inputFile)
    1.77 -        outputUrl = self._toFileUrl(outputFile)
    1.78 -        
    1.79 -        document = self.desktop.loadComponentFromURL(inputUrl, "_blank", 0, self._toProperties(Hidden=True))
    1.80 -        try:
    1.81 -          document.refresh()
    1.82 -        except AttributeError:
    1.83 -          pass
    1.84 -        
    1.85 -        outputExt = self._getFileExt(outputFile)
    1.86 -        filterName = self._filterName(document, outputExt)
    1.87 -
    1.88 -        try:
    1.89 -            document.storeToURL(outputUrl, self._toProperties(FilterName=filterName))
    1.90 -        finally:
    1.91 -            document.close(True)
    1.92 -
    1.93 -    def _filterName(self, document, outputExt):
    1.94 -        family = self._detectFamily(document)
    1.95 -        try:
    1.96 -            filterByFamily = FILTER_MAP[outputExt]
    1.97 -        except KeyError:
    1.98 -            raise DocumentConversionException, "unknown output format: '%s'" % outputExt
    1.99 -        try:
   1.100 -            return filterByFamily[family]
   1.101 -        except KeyError:
   1.102 -            raise DocumentConversionException, "unsupported conversion: from '%s' to '%s'" % (family, outputExt)
   1.103 -    
   1.104 -    def _detectFamily(self, document):
   1.105 -        if document.supportsService("com.sun.star.text.GenericTextDocument"):
   1.106 -            # NOTE: a GenericTextDocument is either a TextDocument, a WebDocument, or a GlobalDocument
   1.107 -            # but this further distinction doesn't seem to matter for conversions
   1.108 -            return FAMILY_TEXT
   1.109 -        if document.supportsService("com.sun.star.sheet.SpreadsheetDocument"):
   1.110 -            return FAMILY_SPREADSHEET
   1.111 -        if document.supportsService("com.sun.star.presentation.PresentationDocument"):
   1.112 -            return FAMILY_PRESENTATION
   1.113 -        if document.supportsService("com.sun.star.drawing.DrawingDocument"):
   1.114 -            return FAMILY_DRAWING
   1.115 -        raise DocumentConversionException, "unknown document family: %s" % document
   1.116 -
   1.117 -    def _getFileExt(self, path):
   1.118 -        ext = splitext(path)[1]
   1.119 -        if ext is not None:
   1.120 -            return ext[1:].lower()
   1.121 -
   1.122 -    def _toFileUrl(self, path):
   1.123 -        return uno.systemPathToFileUrl(abspath(path))
   1.124 -
   1.125 -    def _toProperties(self, **args):
   1.126 -        props = []
   1.127 -        for key in args:
   1.128 -	    prop = PropertyValue()
   1.129 -	    prop.Name = key
   1.130 -	    prop.Value = args[key]
   1.131 -	    props.append(prop)
   1.132 -        return tuple(props)
   1.133 -
   1.134 -
   1.135 -if __name__ == "__main__":
   1.136 -    from sys import argv, exit
   1.137 -    
   1.138 -    if len(argv) < 3:
   1.139 -        print "USAGE: python %s <input-file> <output-file>" % argv[0]
   1.140 -        exit(255)
   1.141 -    if not isfile(argv[1]):
   1.142 -        print "no such input file: %s" % argv[1]
   1.143 -        exit(1)
   1.144 -
   1.145 -    try:
   1.146 -        converter = DocumentConverter()    
   1.147 -        converter.convert(argv[1], argv[2])
   1.148 -    except DocumentConversionException, exception:
   1.149 -        print "ERROR!" + str(exception)
   1.150 -        exit(1)
   1.151 -    except ErrorCodeIOException, exception:
   1.152 -        print "ERROR! ErrorCodeIOException %d" % exception.ErrCode
   1.153 -        exit(1)
   1.154 -
     2.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     2.2 +++ b/grant-oldtext.txt	Sun Apr 12 02:49:55 2009 -0700
     2.3 @@ -0,0 +1,446 @@
     2.4 +
     2.5 +------
     2.6 +
     2.7 +
     2.8 +
     2.9 +Massive new datasets obtained with techniques such as in situ hybridization (ISH) and BAC-transgenics allow the expression levels of many genes at many locations to be compared. This can be used to find marker genes for specific anatomical structures, as well as to draw new anatomical maps. Our goal is to develop automated methods to relate spatial variation in gene expression to anatomy. We have five specific aims:
    2.10 +
    2.11 +(1) develop an algorithm to screen spatial gene expression data for combinations of marker genes which selectively target individual anatomical structures
    2.12 +(2) develop an algorithm to screen spatial gene expression data for combinations of marker genes which can be used to delineate most of the boundaries between a number of anatomical structures at once
    2.13 +(3) develop an algorithm to suggest new ways of dividing a structure up into anatomical subregions, based on spatial patterns in gene expression
    2.14 +(4) create a flat (2-D) map of the mouse cerebral cortex that contains a flattened version of the Allen Mouse Brain Atlas ISH dataset, as well as the boundaries of anatomical areas within the cortex. For each cortical layer, a layer-specific flat dataset will be created. A single combined flat dataset will be created which averages information from all of the layers. These datasets will be made available in both MATLAB and Caret formats.
    2.15 +(5) validate the methods developed in (1), (2) and (3) by applying them to the cerebral cortex datasets created in (4)
    2.16 +
    2.17 +All algorithms that we develop will be implemented in an open-source software toolkit. The toolkit, as well as the machine-readable datasets developed in aim (4) and any other intermediate dataset we produce, will be published and freely available for others to use. 
    2.18 +
    2.19 +In addition to developing generally useful methods, the application of these methods to cerebral cortex will produce immediate benefits that are only one step removed from clinical application, while also supporting the development of new neuroanatomical techniques. The method developed in aim (1) will be applied to each cortical area to find a set of marker genes. Currently, despite the distinct roles of different cortical areas in both normal functioning and disease processes, there are no known marker genes for many cortical areas. Finding marker genes will be immediately useful for drug discovery as well as for experimentation because once marker genes for an area are known, interventions can be designed which selectively target that area. 
    2.20 +
    2.21 +
    2.22 +
    2.23 +
    2.24 +
    2.25 +
    2.26 +
    2.27 +The method developed in aim (2) will be used to find a small panel of genes that can find most of the boundaries between areas in the cortex. Today, finding cortical areal boundaries in a tissue sample is a manual process that requires a skilled human to combine multiple visual cues over a large area of the cortical surface. A panel of marker genes will allow the development of an ISH protocol that will allow experimenters to more easily identify which anatomical areas are present in small samples of cortex.
    2.28 +
    2.29 +
    2.30 +
    2.31 +
    2.32 +
    2.33 +
    2.34 +
    2.35 +
    2.36 +
    2.37 +
    2.38 +
    2.39 +
    2.40 +For each cortical layer, a layer-specific flat dataset will be created. A single combined flat dataset will be created which averages information from all of the layers. These datasets will be made available in both MATLAB and Caret formats.
    2.41 +
    2.42 +
    2.43 +
    2.44 +
    2.45 +
    2.46 +
    2.47 +
    2.48 +
    2.49 +----
    2.50 +
    2.51 +
    2.52 +
    2.53 +New techniques allow the expression levels of many genes at many locations to be compared. It is thought that even neighboring anatomical structures have different gene expression profiles. We propose to develop automated methods to relate the spatial variation in gene expression to anatomy. We will develop two kinds of techniques:
    2.54 +
    2.55 +(a) techniques to screen for combinations of marker genes which selectively target anatomical structures
    2.56 +(b) techniques to suggest new ways of dividing a structure up into anatomical subregions, based on the shapes of contours in the gene expression
    2.57 +
    2.58 +The first kind of technique will be helpful for finding marker genes associated with known anatomical features. The second kind of technique will be helpful in creating new anatomical maps, maps which reflect differences in gene expression the same way that existing maps reflect differences in histology.
    2.59 +
    2.60 +We intend to develop our techniques using the adult mouse cerebral cortex as a testbed. The Allen Brain Atlas has collected a dataset containing the expression level of about 4000 genes* over a set of over 150000 voxels, with a spatial resolution of approximately 200 microns\cite{lein_genome-wide_2007}. 
    2.61 +
    2.62 +We expect to discover sets of marker genes that pick out specific cortical areas. This will allow the development of drugs and other interventions that selectively target individual cortical areas. Therefore our research will lead to application in drug discovery, in the development of other targeted clinical interventions, and in the development of new experimental techniques.
    2.63 +
    2.64 +The best way to divide up rodent cortex into areas has not been completely determined, as can be seen by the differences in the recent maps given by Swanson on the one hand, and Paxinos and Franklin on the other. It is likely that our study, by showing which areal divisions naturally follow from gene expression data, as opposed to traditional histological data, will contribute to the creation of a better map. While we do not here propose to analyze human gene expression data, it is conceivable that the methods we propose to develop could be used to suggest modifications to the human cortical map as well.  
    2.65 +
    2.66 +
    2.67 +In the following, we will only be talking about coronal data.
    2.68 +
    2.69 +The Allen Brain Atlas provides "Smoothed Energy Volumes", which are 
    2.70 +
    2.71 +
    2.72 +One type of artifact in the Allen Brain Atlas data is what we call a "slice artifact". We have noticed two types of slice artifacts in the dataset. The first type, a "missing slice artifact", occurs when the ISH procedure on a slice did not come out well. In this case, the Allen Brain investigators excluded the slice at issue from the dataset. This means that no gene expression information is available for that gene for the region of space covered by that slice. This results in an expression level of zero being assigned to voxels covered by the slice. This is partially but not completely ameliorated by the smoothing that is applied to create the Smoothed Energy Volumes. The usual end result is that a region of space which is shaped and oriented like a coronal slice is marked as having less gene expression than surrounding regions.
    2.73 +
    2.74 +The second type of slice artifact is caused by the fact that all of the slices have a consistent orientation. Since there may be artifacts (such as how well the ISH worked) which are constant within each slice but which vary between different slices, the result is that ceteris paribus, when one compares the genetic data of a voxel to another voxel within the same coronal plane, one would expect to find more similarity than if one compared a voxel to another voxel displaced along the rostrocaudal axis.
    2.75 +
    2.76 +
    2.77 +
    2.78 +
    2.79 +We are enthusiastic about the sharing of methods, data, and results, and at the conclusion of the project, we will make all of our data and computer source code publically available. Our goal is that replicating our results, or applying the methods we develop to other targets, will be quick and easy for other investigators. In order to aid in understanding and replicating our results, we intend to include a software program which, when run, will take as input the Allen Brain Atlas raw data, and produce as output all numbers and charts found in publications resulting from the project.
    2.80 +
    2.81 +
    2.82 +To aid in the replication of our results, we will include a script which takes as input the dataset in aim (3) and provides as output all of the tables in figures in our publications .
    2.83 +
    2.84 +
    2.85 +
    2.86 +
    2.87 +We also expect to weigh in on the debate about how to best partition rodent cortex
    2.88 +
    2.89 +
    2.90 +
    2.91 +be useful for drug discovery as well 
    2.92 +
    2.93 +
    2.94 +
    2.95 +* Another 16000 genes are available, but they do not cover the entire cerebral cortex with high spatial resolution.
    2.96 +
    2.97 +
    2.98 +User-definable ROIs
    2.99 +Combinatorial gene expression
   2.100 +Negative as well as positive signal
   2.101 +Use geometry
   2.102 +Search for local boundaries if necessary
   2.103 +Flatmapped
   2.104 +
   2.105 +
   2.106 +
   2.107 +
   2.108 +
   2.109 +
   2.110 +== Specific aims ==
   2.111 +
   2.112 +==== Develop algorithms that find genetic markers for anatomical regions ====
   2.113 +# Develop scoring measures for evaluating how good individual genes are at marking areas: we will compare pointwise, geometric, and information-theoretic measures.
   2.114 +# Develop a procedure to find single marker genes for anatomical regions: for each cortical area, by using or combining the scoring measures developed, we will rank the genes by their ability to delineate each area. 
   2.115 +# Extend the procedure to handle difficult areas by using combinatorial coding: for areas that cannot be identified by any single gene, identify them with a handful of genes. We will consider both (a) algorithms that incrementally/greedily combine single gene markers into sets, such as forward stepwise regression and decision trees, and also (b) supervised learning techniques which use soft constraints to minimize the number of features, such as sparse support vector machines.
   2.116 +# Extend the procedure to handle difficult areas by combining or redrawing the boundaries: An area may be difficult to identify because the boundaries are misdrawn, or because it does not "really" exist as a single area, at least on the genetic level. We will develop extensions to our procedure which (a) detect when a difficult area could be fit if its boundary were redrawn slightly, and (b) detect when a difficult area could be combined with adjacent areas to create a larger area which can be fit.
   2.117 +
   2.118 +
   2.119 +==== Apply these algorithms to the cortex ====
   2.120 +# Create open source format conversion tools: we will create tools to bulk download the ABA dataset and to convert between SEV, NIFTI and MATLAB formats.
   2.121 +# Flatmap the ABA cortex data: map the ABA data onto a plane and draw the cortical area boundaries onto it.
   2.122 +# Find layer boundaries: cluster similar voxels together in order to automatically find the cortical layer boundaries.
   2.123 +# Run the procedures that we developed on the cortex: we will present, for each area, a short list of markers to identify that area; and we will also present lists of "panels" of genes that can be used to delineate many areas at once.
   2.124 +
   2.125 +==== Develop algorithms to suggest a division of a structure into anatomical parts ====
   2.126 +# Explore dimensionality reduction algorithms applied to pixels: including TODO
   2.127 +# Explore dimensionality reduction algorithms applied to genes: including TODO
   2.128 +# Explore clustering algorithms applied to pixels: including TODO
   2.129 +# Explore clustering algorithms applied to genes: including gene shaving, TODO
   2.130 +# Develop an algorithm to use dimensionality reduction and/or hierarchial clustering to create anatomical maps
   2.131 +# Run this algorithm on the cortex: present a hierarchial, genoarchitectonic map of the cortex
   2.132 +
   2.133 +
   2.134 +
   2.135 +
   2.136 +
   2.137 +
   2.138 +
   2.139 +
   2.140 +
   2.141 +gradient similarity is calculated as:
   2.142 +\sum_pixels cos(abs(\angle \nabla_1 - \angle \nabla_2)) \cdot \frac{\vert \nabla_1 \vert + \vert \nabla_2 \vert}{2}  \cdot \frac{pixel\_value_1 + pixel\_value_2}{2}
   2.143 +
   2.144 +
   2.145 +
   2.146 +
   2.147 +
   2.148 +
   2.149 +
   2.150 +(todo) Technically, we say that an anatomical structure has a fundamentally 2-D organization when there exists a commonly used, generic, anatomical structure-preserving map from 3-D space to a 2-D manifold.  
   2.151 +
   2.152 +
   2.153 +Related work:
   2.154 +
   2.155 +
   2.156 +The Allen Brain Institute has developed an interactive web interface called AGEA which allows an investigator to (1) calculate lists of genes which are selectively overexpressed in certain anatomical regions (ABA calls this the "Gene Finder" function) (2) to visualize the correlation between the genetic profiles of voxels in the dataset, and (3) to visualize a hierarchial clustering of voxels in the dataset \cite{ng_anatomic_2009}. AGEA is an impressive and useful tool, however, it does not solve the same problems that we propose to solve with this project.
   2.157 +
   2.158 +First we describe AGEA's "Gene Finder", and then compare it to our proposed method for finding marker genes. AGEA's Gene Finder first asks the investigator to select a single "seed voxel" of interest. It then uses a clustering method, combined with built-in knowledge of major anatomical structures, to select two sets of voxels; an "ROI" and a "comparator region"*. The seed voxel is always contained within the ROI, and the ROI is always contained within the comparator region. The comparator region is similar but not identical to the set of voxels making up the major anatomical region containing the ROI. Gene Finder then looks for genes which can distinguish the ROI from the comparator region. Specifically, it finds genes for which the ratio (expression energy in the ROI) / (expression energy in the comparator region) is high. 
   2.159 +
   2.160 +Informally, the Gene Finder first infers an ROI based on clustering the seed voxel with other voxels. Then, the Gene Finder finds genes which overexpress in the ROI as compared to other voxels in the major anatomical region. 
   2.161 +
   2.162 +There are three major differences between our approach and Gene Finder. 
   2.163 +
   2.164 +First, Gene Finder focuses on individual genes and individual ROIs in isolation. This is great for regions which can be picked out from all other regions by a single gene, but not all of them can (todo). There are at least two ways this can miss out on useful genes. First, a gene might express in part of a region, but not throughout the whole region, but there may be another gene which expresses in the rest of the region*. Second, a gene might express in a region, but not in any of its neighbors, but it might express also in other non-neighboring regions. To take advantage of these types of genes, we propose to find combinations of genes which, together, can identify the boundaries of all subregions within the containing region.
   2.165 +
   2.166 +Second, Gene Finder uses a pointwise metric, namely expression energy ratio, to decide whether a gene is good for picking out a region. We have found better results by using metrics which take into account not just single voxels, but also the local geometry of neighboring voxels, such as the local gradient (todo). In addition, we have found that often the absence of gene expression can be used as a marker, which will not be caught by Gene Finder's expression energy ratio (todo).
   2.167 +
   2.168 +Third, Gene Finder chooses the ROI based only on the seed voxel. This often does not permit the user to query the ROI that they are interested in. For example, in all of our tests of Gene Finder in cortex, the ROIs chosen tend to be cortical layers, rather than cortical areas.
   2.169 +
   2.170 +In summary, when Gene Finder picks the ROI that you want, and when this ROI can be easily picked out from neighboring regions by single genes which selectively overexpress in the ROI compared to the entire major anatomical region, Gene Finder will work. However, Gene Finder will not pick cortical areas as ROIs, and even if it could, many cortical areas cannot be uniquely picked out by the overexpression of any single gene. By contrast, we will target cortical areas, we will explore a variety of metrics which can complement the shortcomings of expression energy ratio, and we will use the combinatorial expression of genes to pick out cortical areas even when no individual gene will do.
   2.171 +
   2.172 +
   2.173 +* The terms "ROI" and "comparator region" are our own; the ABI calls them the "local region" and the "larger anatomical context". The ABI uses the term "specificity comparator" to mean the major anatomic region containing the ROI, which is not exactly identical to the comparator region.
   2.174 +
   2.175 +** In this case, the union of the area of expression of the two genes would suffice; one could also imagine that there could be situations in which the intersection of multiple genes would be needed, or a combination of unions and intersections.
   2.176 +
   2.177 +
   2.178 +Now we describe AGEA's hierarchial clustering, and compare it to our proposal. The goal of AGEA's hierarchial clustering is to generate a binary tree of clusters, where a cluster is a collection of voxels. AGEA begins by computing the Pearson correlation between each pair of voxels. They then employ a recursive divisive (top-down) hierarchial clustering procedure on the voxels, which means that they start with all of the voxels, and then they divide them into clusters, and then within each cluster, they divide that cluster into smaller clusters, etc***. At each step, the collection of voxels is partitioned into two smaller clusters in a way that maximizes the following quantity: average correlation between all possible pairs of voxels containing one voxel from each cluster. 
   2.179 +
   2.180 +There are three major differences between our approach and AGEA's hierarchial clustering. First, AGEA's clustering method separates cortical layers before it separates cortical areas. 
   2.181 +
   2.182 +
   2.183 +
   2.184 +
   2.185 +
   2.186 +following procedure is used for the purpose of dividing a collection of voxels into smaller clusters: partition the voxels into two sets, such that the following quantity is maximized: 
   2.187 +
   2.188 +*** depending on which level of the tree is being created, the voxels are subsampled in order to save time
   2.189 +
   2.190 +
   2.191 +
   2.192 +
   2.193 +
   2.194 +does not allow the user to input anything other than a seed voxel; this means that for each seed voxel, there is only one 
   2.195 +
   2.196 +
   2.197 +
   2.198 +The role of the "local region" is to serve as a region of interest for which marker genes are desired; the role of the "larger anatomical context" is to be the structure 
   2.199 +
   2.200 +
   2.201 +
   2.202 +There are two kinds of differences between AGEA and our project; differences that relate to the treatment of the cortex, and differences in the type of generalizable methods being developed. As relates 
   2.203 +
   2.204 +
   2.205 +indicate an ROI  
   2.206 +
   2.207 +explore simple correlation-based relationships between voxels, genes, and clusters of voxels. 
   2.208 +
   2.209 +
   2.210 +There have not yet been any studies which describe the results of applying AGEA to the cerebral cortex; however, we suspect that the AGEA metrics are not optimal for the task of relating genes to cortical areas. A voxel's gene expression profile depends upon both its cortical area and its cortical layer, however, AGEA has no mechanism to distinguish these two. As a result, voxels in the same layer but different areas are often clustered together by AGEA. As part of the project, we will compare the performance of our techniques against AGEA's.
   2.211 +
   2.212 +---
   2.213 +
   2.214 +The Allen Brain Institute has developed interactive tools called AGEA which allow an investigator to explore simple correlation-based relationships between voxels, genes, and clusters of voxels. There have not yet been any studies which describe the results of applying AGEA to the cerebral cortex; however, we suspect that the AGEA metrics are not optimal for the task of relating genes to cortical areas. A voxel's gene expression profile depends upon both its cortical area and its cortical layer, however, AGEA has no mechanism to distinguish these two. As a result, voxels in the same layer but different areas are often clustered together by AGEA. As part of the project, we will compare the performance of our techniques against AGEA's.
   2.215 +
   2.216 +Another difference between our techniques and AGEA's is that AGEA allows the user to enter only a voxel location, and then to either explore the rest of the brain's relationship to that particular voxel, or explore a partitioning of the brain based on pairwise voxel correlation. If the user is interested not in a single voxel, but rather an entire anatomical structure, AGEA will only succeed to the extent that the selected voxel is a typical representative of the structure. As discussed in the previous paragraph, this poses problems for structures like cortical areas, which (because of their division into cortical layers) do not have a single "typical representative".
   2.217 +
   2.218 +By contrast, in our system, the user will start by selecting, not a single voxel, but rather, an anatomical superstructure to be divided into pieces (for example, the cerebral cortex). We expect that our methods will take into account not just pairwise statistics between voxels, but also large-scale geometric features (for example, the rapidity of change in gene expression as regional boundaries are crossed) which optimize the discriminability of regions within the selected superstructure.
   2.219 +
   2.220 +
   2.221 +-----
   2.222 +
   2.223 +screen for combinations of marker genes which selectively target anatomical structures
   2.224 +pick delineate the boundaries between neighboring anatomical structures.
   2.225 +(b) techniques to screen for marker genes which pick out anatomical structures of interest
   2.226 +
   2.227 +, techniques which: (a) screen for marker genes , and (b) suggest new anatomical maps based on 
   2.228 +
   2.229 +
   2.230 +whose expression partitions the region of interest into its anatomical substructures, and (b) use the natural contours of gene expression to suggest new ways of dividing an organ into 
   2.231 +
   2.232 +
   2.233 +The Allen Brain Atlas
   2.234 +
   2.235 +
   2.236 +
   2.237 +
   2.238 +--
   2.239 +
   2.240 +to: brooksl@mail.nih.gov
   2.241 +
   2.242 +Hi, I'm writing to confirm the applicability of a potential research
   2.243 +project to the challenge grant topic "New computational and
   2.244 +statistical methods for the analysis of large
   2.245 +data sets from next-generation sequencing technologies".
   2.246 +
   2.247 +We want to develop methods for the analysis of gene expression
   2.248 +datasets that can be used to uncover the relationships between gene
   2.249 +expression and anatomical regions. Specifically, we want to develop
   2.250 +techniques to (a) given a set of known anatomical areas, identify
   2.251 +genetic markers for each of these areas, and (b) given an anatomical structure
   2.252 +whose substructure is unknown, suggest a map, that is, a division of
   2.253 +the space into anatomical sub-structures, that represents the
   2.254 +boundaries inherent in the gene expression data.
   2.255 +
   2.256 +We propose to develop our techniques on the Allen Brain
   2.257 +Atlas mouse brain gene expression dataset by finding genetic markers
   2.258 +for anatomical areas within the cerebral cortex. The Allen Brain Atlas
   2.259 +contains a registered 3-D map of gene expression data with 200-micron
   2.260 +voxel resolution which was created from in situ hybridization
   2.261 +data. The dataset contains about 4000 genes which are available at
   2.262 +this resolution across the entire cerebral cortex.
   2.263 +
   2.264 +Despite the distinct roles of different cortical
   2.265 +areas in both normal functioning and disease processes, there are no
   2.266 +known marker genes for many cortical areas. This project will be
   2.267 +immediately useful for both drug discovery and clinical research
   2.268 +because once the markers are known, interventions can be designed
   2.269 +which selectively target specific cortical areas.
   2.270 +
   2.271 +This techniques we develop will be useful because they will be
   2.272 +applicable to the analysis of other anatomical areas, both in
   2.273 +terms of finding marker genes for known areas, and in terms of
   2.274 +suggesting new anatomical subdivisions that are based upon the gene
   2.275 +expression data.
   2.276 +
   2.277 +
   2.278 +
   2.279 +----
   2.280 +
   2.281 +
   2.282 +
   2.283 +
   2.284 +
   2.285 +
   2.286 +It is likely that our study, by showing which areal divisions naturally follow from gene expression data, as opposed to traditional histological data, will contribute to the creation of
   2.287 +
   2.288 +there are clear genetic or chemical markers known for only a few cortical areas. This makes it difficult to target drugs to specific
   2.289 +
   2.290 +As part of aims (1) and (5), we will discover sets of marker genes that pick out specific cortical areas. This will allow the development of drugs and other interventions that selectively target individual cortical areas. As part of aims (2) and (5), we will also discover small panels of marker genes that can be used to delineate most of the cortical areal map. 
   2.291 +
   2.292 +
   2.293 +
   2.294 +With aims (2) and (4), we
   2.295 +
   2.296 +There are five principals
   2.297 +
   2.298 +
   2.299 +
   2.300 +In addition to validating the usefulness of the algorithms, the application of these methods to cerebral cortex will produce immediate benefits that are only one step removed from clinical application. 
   2.301 +
   2.302 +
   2.303 +todo: remember to check gensat, etc for validation (mention bias/variance)
   2.304 +
   2.305 +
   2.306 +
   2.307 +=== Why it is useful to apply these methods to cortex ===
   2.308 +
   2.309 +
   2.310 +There is still room for debate as to exactly how the cortex should be parcellated into areas. 
   2.311 +
   2.312 +
   2.313 +The best way to divide up rodent cortex into areas has not been completely determined, 
   2.314 +
   2.315 +
   2.316 +not yet been accounted for in 
   2.317 +
   2.318 + that the expression of some genes will contain novel spatial patterns which are not account
   2.319 +
   2.320 + that a genoarchitectonic map
   2.321 +
   2.322 +
   2.323 +This principle is only applicable to aim 1 (marker genes). For aim 2 (partition a structure in into anatomical subregions), we plan to work with many genes at once. 
   2.324 +
   2.325 +
   2.326 +tood: aim 2 b+s?
   2.327 +
   2.328 +
   2.329 +
   2.330 +
   2.331 +==== Principle 5: Interoperate with existing tools ====
   2.332 +
   2.333 +In order for our software to be as useful as possible for our users, it will be able to import and export data to standard formats so that users can use our software in tandem with other software tools created by other teams. We will support the following formats: NIFTI (Neuroimaging Informatics Technology Initiative), SEV (Allen Brain Institute Smoothed Energy Volume), and MATLAB. This ensures that our users will not have to exclusively rely on our tools when analyzing data. For example, users will be able to use the data visualization and analysis capabilities of MATLAB and Caret alongside our software.
   2.334 +
   2.335 +To our knowledge, there is no currently available software to convert between these formats, so we will also provide a format conversion tool. This may be useful even for groups that don't use any of our other software.
   2.336 +
   2.337 +
   2.338 +
   2.339 +todo: is "marker gene" even a phrase that we should use at all?
   2.340 +
   2.341 +
   2.342 +
   2.343 +note for aim 1 apps: combo of genes is for voxel, not within any single cell
   2.344 +
   2.345 +
   2.346 +
   2.347 +
   2.348 +, as when genetic markers allow the development of selective interventions; the reason that one can be confident that the intervention is selective is that it is only turned on when a certain combination of genes is turned on and off. The result procedure is what assures us that when that combination is present, the local tissue is probably part of a certain subregion. 
   2.349 +
   2.350 +
   2.351 +
   2.352 +The basic idea is that we want to find a procedure by 
   2.353 +
   2.354 +The task of finding genes that mark anatomical areas can be phrased in terms of what the field of machine learning calls a "supervised learning" task. The goal of this task is to learn a function (the "classifier") which
   2.355 +
   2.356 +If a person knows a combination of genes that mark an area, that implies that the person can be told how strong those genes express in any voxel, and the person can use this information to determine how 
   2.357 +
   2.358 +finding how to infer the areal identity of a voxel if given the gene expression profile of that voxel.
   2.359 +
   2.360 + 
   2.361 +For each voxel in the cortex, we want to start with data about the gene expression 
   2.362 +
   2.363 +
   2.364 +
   2.365 +There are various ways to look for marker genes. We will define some terms, and along the way we will describe a few design choices encountered in the process of creating a marker gene finding method, and then we will present four principles that describe which options we have chosen.
   2.366 +
   2.367 +In developing a procedure for finding marker genes, we are developing a procedure that takes a dataset of experimental observations and produces a result. One can think of the result as merely a list of genes, but really the result is an understanding of a predictive relationship between, on the one hand, the expression levels of genes, and, on the other hand, anatomical subregions.
   2.368 +
   2.369 +One way to more formally define this understanding is to look at it as a procedure. In this view, the result of the learning procedure is itself a procedure. The result procedure provides a way to use the gene expression profiles of voxels in a tissue sample in order to determine where the subregions are.
   2.370 +
   2.371 +This result procedure can be used directly, as when an experimenter has a tissue sample and needs to know what subregions are present in it, and, if multiple subregions are present, where they each are. Or it can be used indirectly; imagine that the result procedure tells us that whenever a certain combination of genes are expressed, the local tissue is probably part of a certain subregion. This means that we can then confidentally develop an intervention which is triggered only when that combination of genes are expressed; and to the extent that the result procedure is reliable, we know that the intervention will only be triggered in the target subregion.
   2.372 +
   2.373 +We said that the result procedure provides "a way to use the gene expression profiles of voxels in a tissue sample" in order to "determine where the subregions are". 
   2.374 +
   2.375 +
   2.376 +Does the result procedure get as input all of the gene expression profiles of each voxel in the entire tissue sample, and produce as output all of the subregional boundaries all at once? 
   2.377 +
   2.378 +
   2.379 +
   2.380 +
   2.381 +
   2.382 +
   2.383 + it is helpful for the classifier to look at the global "shape" of gene expression patterns over the whole structure, rather than just nearby voxels.
   2.384 +
   2.385 +
   2.386 +
   2.387 +
   2.388 +there is some small bit of additional information that can be gleaned from knowing the 
   2.389 +
   2.390 +==== Design choices for a supervised learning procedure ====
   2.391 +
   2.392 +
   2.393 +After all, 
   2.394 +
   2.395 +there is a small correlation between the gene expression levels from distant voxels and
   2.396 +
   2.397 +Depending on how we intend to use the classifier, we may want to design it so that
   2.398 +
   2.399 +It is possible for many things to 
   2.400 +
   2.401 +The choice of which data is made part of an instance 
   2.402 +
   2.403 +what we seek is a procedure 
   2.404 +
   2.405 + partition the tissue sample into subregions. 
   2.406 +
   2.407 + each part of the anatomical structure 
   2.408 +
   2.409 + must be One way to rephrase this task is to say that, instead of searching for the location of the subregions, we are looking to partition the tissue sample into subregions. 
   2.410 +
   2.411 +
   2.412 +There are various ways to look for marker genes. We will define some terms, and along the way we will describe a few design choices encountered in the process of creating a marker gene finding method, and then we will present four principles that describe which options we have chosen.
   2.413 +
   2.414 +In developing a procedure for finding marker genes, we are developing a procedure that takes a dataset of experimental observations and produces a result. One can think of the result as merely a list of genes, but really the result is an understanding of a predictive relationship between, on the one hand, the expression levels of genes, and, on the other hand, anatomical subregions.
   2.415 +
   2.416 +One way to more formally define this understanding is to look at it as a procedure. In this view, the result of the learning procedure is itself a procedure. The result procedure provides a way to use the gene expression profiles of voxels in a tissue sample in order to determine where the subregions are.
   2.417 +
   2.418 +This result procedure can be used directly, as when an experimenter has a tissue sample and needs to know what subregions are present in it, and, if multiple subregions are present, where they each are. Or it can be used indirectly; imagine that the result procedure tells us that whenever a certain combination of genes are expressed, the local tissue is probably part of a certain subregion. This means that we can then confidentally develop an intervention which is triggered only when that combination of genes are expressed; and to the extent that the result procedure is reliable, we know that the intervention will only be triggered in the target subregion.
   2.419 +
   2.420 +We said that the result procedure provides "a way to use the gene expression profiles of voxels in a tissue sample" in order to "determine where the subregions are". 
   2.421 +
   2.422 +
   2.423 +Does the result procedure get as input all of the gene expression profiles of each voxel in the entire tissue sample, and produce as output all of the subregional boundaries all at once? 
   2.424 +
   2.425 +
   2.426 +Or are we given one voxel at a time, 
   2.427 +
   2.428 +
   2.429 +In the jargon of the field of machine learning, the result procedure is called a __classifier__.
   2.430 +
   2.431 +
   2.432 +The task of finding genes that mark anatomical areas can be phrased in terms of what the field of machine learning calls a "supervised learning" task. The goal of this task is to learn a function (the "classifier") which
   2.433 +
   2.434 +If a person knows a combination of genes that mark an area, that implies that the person can be told how strong those genes express in any voxel, and the person can use this information to determine how 
   2.435 +
   2.436 +finding how to infer the areal identity of a voxel if given the gene expression profile of that voxel.
   2.437 +
   2.438 + 
   2.439 +For each voxel in the cortex, we want to start with data about the gene expression 
   2.440 +
   2.441 +
   2.442 +
   2.443 +single voxels, but rather groups of voxels, such that the groups can be placed in some 2-D space. We will call such instances "pixels".
   2.444 +
   2.445 +We have been speaking as if instances necessarily correspond to single voxels. But it is possible for instances to be groupings of many voxels, in which case each grouping must be assigned the same label (that is, each voxel grouping must stay inside a single anatomical subregion).
   2.446 +
   2.447 +
   2.448 +
   2.449 +In some but not all cases, the groups are either rows or columns of voxels. This is the case with the cerebral cortex, in which one may assume that columns of voxels which run perpendicular to the cortical surface all share the same areal identity. In the cortex, we call such an instance a "surface pixel", because such an instance represents the data associated with all voxels underneath a specific patch of the cortical surface. 
     3.1 Binary file grant.doc has changed
     4.1 --- a/grant.html	Sat Apr 11 21:43:12 2009 -0700
     4.2 +++ b/grant.html	Sun Apr 12 02:49:55 2009 -0700
     4.3 @@ -1,6 +1,5 @@
     4.4  Specific aims
     4.5 -            todo test4
     4.6 -               Massive new datasets obtained with techniques such as in situ hybridization
     4.7 +            Massive new datasets obtained with techniques such as in situ hybridization
     4.8              (ISH) and BAC-transgenics allow the expression levels of many genes at many
     4.9              locations to be compared. Our goal is to develop automated methods to relate
    4.10              spatial variation in gene expression to anatomy. We want to find marker genes
    4.11 @@ -25,7 +24,7 @@
    4.12              in aim (3), will be published and freely available for others to use.
    4.13               Background and significance
    4.14               Aim 1
    4.15 -             Machine learning terminology
    4.16 +             Machine learning terminology: supervised learning
    4.17              The task of looking for marker genes for anatomical subregions means that one
    4.18              is looking for a set of genes such that, if the expression level of those genes is
    4.19              known, then the locations of the subregions can be inferred.
    4.20 @@ -35,15 +34,15 @@
    4.21              determine to which subregion each voxel within the structure belongs. We call
    4.22              this a classification task, because each voxel is being assigned to a class (namely,
    4.23              its subregion).
    4.24 -                                            1
    4.25 -
    4.26                 Therefore, an understanding of the relationship between the combination of
    4.27              their expression levels and the locations of the subregions may be expressed as
    4.28 +                                            1
    4.29 +
    4.30              a function. The input to this function is a voxel, along with the gene expression
    4.31              levels within that voxel;  the output is the subregional identity of the target
    4.32              voxel, that is, the subregion to which the target voxel belongs.  We call this
    4.33              function a classifier.  In general, the input to a classifier is called an instance,
    4.34 -            and the output is called a label.
    4.35 +            and the output is called a label (or a class label).
    4.36                 The object of aim 1 is not to produce a single classifier, but rather to develop
    4.37              an automated method for determining a classifier for any known anatomical
    4.38              structure.  Therefore, we seek a procedure by which a gene expression dataset
    4.39 @@ -80,11 +79,11 @@
    4.40                 Key questions when choosing a learning method are: What are the instances?
    4.41              What are the features?  How are the features chosen?  Here are four principles
    4.42              that outline our answers to these questions.
    4.43 -                                            2
    4.44 -
    4.45               Principle 1: Combinatorial gene expression
    4.46              Above, we defined an “instance” as the combination of a voxel with the “asso-
    4.47              ciated gene expression data”.  In our case this refers to the expression level of
    4.48 +                                            2
    4.49 +
    4.50              genes within the voxel, but should we include the expression levels of all genes,
    4.51              or only a few of them?
    4.52                 It is too much to hope that every anatomical region of interest will be iden-
    4.53 @@ -125,7 +124,98 @@
    4.54                                              3
    4.55  
    4.56               Aim 2
    4.57 -            todo
    4.58 +             Machine learning terminology: clustering
    4.59 +            If one is given a dataset consisting merely of instances, with no class labels, then
    4.60 +            analysis of the dataset is referred to as unsupervised learning in the jargon of
    4.61 +            machine learning.  One thing that you can do with such a dataset is to group
    4.62 +            instances together. A set of similar instances is called a cluster, and the activity
    4.63 +            of finding grouping the data into clusters is called clustering or cluster analysis.
    4.64 +               The task of deciding how to carve up a structure into anatomical subregions
    4.65 +            can be put into these terms.  The instances are once again voxels (or pixels)
    4.66 +            along with their associated gene expression profiles.  We make the assumption
    4.67 +            that voxels from the same subregion have similar gene expression profiles, at
    4.68 +            least compared to the other subregions.  This means that clustering voxels is
    4.69 +            the same as finding potential subregions; we seek a partitioning of the voxels
    4.70 +            into subregions, that is, into clusters of voxels with similar gene expression.
    4.71 +               It is desirable to determine not just one set of subregions,  but also how
    4.72 +            these subregions relate to each other, if at all; perhaps some of the subregions
    4.73 +            are more similar to each other than to the rest, suggesting that, although at a
    4.74 +            fine spatial scale they could be considered separate, on a coarser spatial scale
    4.75 +            they could be grouped together into one large subregion.  This suggests the
    4.76 +            outcome of clustering may be a hierarchial tree of clusters, rather than a single
    4.77 +            set of clusters which partition the voxels. This is called hierarchial clustering.
    4.78 +             Similarity scores
    4.79 +            todo
    4.80 +             Spatially contiguous clusters; image segmentation
    4.81 +            We have shown that aim 2 is a type of clustering task. In fact, it is a special type
    4.82 +            of clustering task because we have an additional constraint on clusters; voxels
    4.83 +            grouped together into a cluster must be spatially contiguous.  In Preliminary
    4.84 +            Results,  we show that one can get reasonable results without enforcing this
    4.85 +            constraint, however, we plan to compare these results against other methods
    4.86 +            which guarantee contiguous clusters.
    4.87 +               Perhaps the biggest source of continguous clustering algorithms is the field
    4.88 +            of computer vision, which has produced a variety of image segmentation algo-
    4.89 +            rithms.  Image segmentation is the task of partitioning the pixels in a digital
    4.90 +            image into clusters, usually contiguous clusters.  Aim 2 is similar to an image
    4.91 +            segmentation task. There are two main differences; in our task, there are thou-
    4.92 +            sands of color channels (one for each gene), rather than just three.  There are
    4.93 +            imaging tasks which use more than three colors, however, for example multispec-
    4.94 +            tral imaging and hyperspectral imaging, which are often used to process satellite
    4.95 +            imagery. A more crucial difference is that there are various cues which are ap-
    4.96 +            propriate for detecting sharp object boundaries in a visual scene but which are
    4.97 +            not appropriate for segmenting abstract spatial data such as gene expression.
    4.98 +                                            4
    4.99 +
   4.100 +            Although many image segmentation algorithms can be expected to work well
   4.101 +            for segmenting other sorts of spatially arranged data, some of these algorithms
   4.102 +            are specialized for visual images.
   4.103 +             Dimensionality reduction
   4.104 +            Unlike aim 1, there is no externally-imposed need to select only a handful of
   4.105 +            informative genes for inclusion in the instances.  However, some clustering al-
   4.106 +            gorithms perform better on small numbers of features.  There are techniques
   4.107 +            which “summarize” a larger number of features using a smaller number of fea-
   4.108 +            tures; these techniques go by the name of feature extraction or dimensionality
   4.109 +            reduction.  The small set of features that such a technique yields is called the
   4.110 +            reduced feature set. After the reduced feature set is created, the instances may
   4.111 +            be replaced by reduced instances, which have as their features the reduced fea-
   4.112 +            ture set rather than the original feature set of all gene expression levels.  Note
   4.113 +            that the features in the reduced feature set do not necessarily correspond to
   4.114 +            genes; each feature in the reduced set may be any function of the set of gene
   4.115 +            expression levels.
   4.116 +               Another use for dimensionality reduction is to visualize the relationships
   4.117 +            between subregions.  For example, one might want to make a 2-D plot upon
   4.118 +            which each subregion is represented by a single point, and with the property
   4.119 +            that subregions with similar gene expression profiles should be nearby on the
   4.120 +            plot (that is, the property that distance between pairs of points in the plot
   4.121 +            should be proportional to some measure of dissimilarity in gene expression). It
   4.122 +            is likely that no arrangement of the points on a 2-D plan will exactly satisfy
   4.123 +            this property – however, dimensionality reduction techniques allow one to find
   4.124 +            arrangements of points that approximately satisfy that property.   Note that
   4.125 +            in this application, dimensionality reduction is being applied after clustering;
   4.126 +            whereas in the previous paragraph, we were talking about using dimensionality
   4.127 +            reduction before clustering.
   4.128 +             Clustering genes rather than voxels
   4.129 +            Although the ultimate goal is to cluster the instances (voxels or pixels), one
   4.130 +            strategy to achieve this goal is to first cluster the features (genes).  There are
   4.131 +            two ways that clusters of genes could be used.
   4.132 +               Gene clusters could be used as part of dimensionality reduction: rather than
   4.133 +            have one feature for each gene, we could have one reduced feature for each gene
   4.134 +            cluster.
   4.135 +               Gene clusters could also be used to directly yield a clustering on instances.
   4.136 +            This is because many genes have an expression pattern which seems to pick
   4.137 +            out a single, spatially continguous subregion. Therefore, it seems likely that an
   4.138 +            anatomically interesting subregion will have multiple genes which each individ-
   4.139 +            ually pick it out1. This suggests the following procedure: cluster together genes
   4.140 +__________________________
   4.141 +   1This would seem to contradict our finding in aim 1 that some cortical areas are combina-
   4.142 +torially coded by multiple genes.  However, it is possible that the currently accepted cortical
   4.143 +                                            5
   4.144 +
   4.145 +            which pick out similar subregions, and then to use the more popular common
   4.146 +            subregions as the final clusters. In the Preliminary Data we show that a num-
   4.147 +            ber of anatomically recognized cortical regions, as well as some “superregions”
   4.148 +            formed by lumping together a few regions, are associated with gene clusters in
   4.149 +            this fashion.
   4.150               Aim 3
   4.151               Background
   4.152              The cortex is divided into areas and layers.  To a first approximation, the par-
   4.153 @@ -160,13 +250,17 @@
   4.154              marker genes will allow the development of an ISH protocol that will allow
   4.155              experimenters to more easily identify which anatomical areas are present in
   4.156              small samples of cortex.
   4.157 +______
   4.158 +maps divide the cortex into subregions which are unnatural from the point of view of gene
   4.159 +expression; perhaps there is some other way to map the cortex for which each subregion can
   4.160 +be identified by single genes.
   4.161 +                                            6
   4.162 +
   4.163                 The method developed in aim (3) will provide a genoarchitectonic viewpoint
   4.164              that will contribute to the creation of a better map. The development of present-
   4.165              day cortical maps was driven by the application of histological stains.   It is
   4.166              conceivable that if a different set of stains had been available which identified
   4.167              a different set of features, then the today’s cortical maps would have come out
   4.168 -                                            4
   4.169 -
   4.170              differently. Since the number of classes of stains is small compared to the number
   4.171              of genes, it is likely that there are many repeated, salient spatial patterns in
   4.172              the gene expression which have not yet been captured by any stain. Therefore,
   4.173 @@ -177,37 +271,30 @@
   4.174              modifications to the human cortical map as well.
   4.175               Related work
   4.176              todo
   4.177 +               vs. AGEA – i wrote something on this but i’m going to rewrite it
   4.178               Preliminary work
   4.179 -             Justification of principles 1 thur 3
   4.180 -             Principle 1: Combinatorial gene expression
   4.181 +             Format conversion between SEV, MATLAB, NIFTI
   4.182 +            todo
   4.183 +             Flatmap of cortex
   4.184 +            todo
   4.185 +             Using combinations of multiple genes is necessary and sufficient to
   4.186 +            delineate some cortical areas
   4.187              Here we give an example of a cortical area which is not marked by any single
   4.188              gene, but which can be identified combinatorially.  according to logistic regres-
   4.189 -            sion, gene wwc11 is the best fit single gene for predicting whether or not a pixel
   4.190 +            sion, gene wwc12 is the best fit single gene for predicting whether or not a pixel
   4.191              on the cortical surface belongs to the motor area (area MO). The upper-left
   4.192              picture in Figure  shows wwc1’s spatial expression pattern over the cortex. The
   4.193              lower-right boundary of MO is represented reasonably well by this gene, however
   4.194              the gene overshoots the upper-left boundary. This flattened 2-D representation
   4.195              does not show it, but the area corresponding to the overshoot is the medial
   4.196              surface of the cortex. MO is only found on the lateral surface (todo).
   4.197 -               Gnee mtif22 is shown in figure the upper-right of Fig. . Mtif2 captures MO’s
   4.198 +               Gnee mtif23 is shown in figure the upper-right of Fig. . Mtif2 captures MO’s
   4.199              upper-left boundary, but not its lower-right boundary.  Mtif2 does not express
   4.200              very much on the medial surface.  By adding together the values at each pixel
   4.201 -            in these two figures, we get the lower-left of Figure . This combination captures
   4.202 -            area MO much better than any single gene.
   4.203 -             Principle 2: Only look at combinations of small numbers of genes
   4.204 -            In order to see how well one can do when looking at all genes at once, we ran
   4.205 -            a support vector machine to classify cortical surface pixels based on their gene
   4.206 -            expression profiles. We achieved classification accuracy of about 81%3. As noted
   4.207 -            above, however, a classifier that looks at all the genes at once isn’t practically
   4.208 -            useful.
   4.209 -_____________________
   4.210 -   1“WW, C2 and coiled-coil domain containing 1”; EntrezGene ID 211652
   4.211 -    2“mitochondrial translational initiation factor 2”; EntrezGene ID 76784
   4.212 -    3Using the Shogun SVM package (todo:cite), with parameters type=GMNPSVM (multi-
   4.213 -class b-SVM), kernal = gaussian with sigma = 0.1, c = 10, epsilon = 1e-1 – these are the
   4.214 -first parameters we tried, so presumably performance would improve with different choices of
   4.215 -parameters. 5-fold cross-validation.
   4.216 -                                            5
   4.217 +__________________________
   4.218 +   2“WW, C2 and coiled-coil domain containing 1”; EntrezGene ID 211652
   4.219 +    3“mitochondrial translational initiation factor 2”; EntrezGene ID 76784
   4.220 +                                            7
   4.221  
   4.222                                          
   4.223              
   4.224 @@ -219,19 +306,33 @@
   4.225              the boundary of region MO. Pixels are colored approximately according to the
   4.226              density of expressing cells underneath each pixel, with red meaning a lot of
   4.227              expression and blue meaning little.
   4.228 -                                            6
   4.229 -
   4.230 -               The requirement to find combinations of only a small number of genes limits
   4.231 -            us from straightforwardly applying many of the most simple techniques from
   4.232 -            the field of supervised machine learning.  In the parlance of machine learning,
   4.233 -            our task combines feature selection with supervised learning.
   4.234 -             Principle 3: Use geometry
   4.235 +            in these two figures, we get the lower-left of Figure . This combination captures
   4.236 +            area MO much better than any single gene.
   4.237 +             Geometric and pointwise scoring methods provide complementary
   4.238 +            information
   4.239              To show that local geometry can provide useful information that cannot be
   4.240              detected via pointwise analyses, consider Fig.  .  The top row of Fig.   displays
   4.241              the 3 genes which most match area AUD, according to a pointwise method4. The
   4.242              bottom row displays the 3 genes which most match AUD according to a method
   4.243              which considers local geometry5 The pointwise method in the top row identifies
   4.244              genes which express more strongly in AUD than outside of it; its weakness is that
   4.245 +__________________________
   4.246 +   4For each gene, a logistic regression in which the response variable was whether or not a
   4.247 +surface pixel was within area AUD, and the predictor variable was the value of the expression
   4.248 +of the gene underneath that pixel. The resulting scores were used to rank the genes in terms
   4.249 +of how well they predict area AUD.
   4.250 +    5For each gene the gradient similarity (see section ??) between (a) a map of the expression
   4.251 +of each gene on the cortical surface and (b) the shape of area AUD, was calculated, and this
   4.252 +was used to rank the genes.
   4.253 +                                            8
   4.254 +
   4.255 +                                                        
   4.256 +                                                        
   4.257 +            Figure 2: The top row shows the three genes which (individually) best predict
   4.258 +            area AUD, according to logistic regression.  The bottom row shows the three
   4.259 +            genes which (individually) best match area AUD, according to gradient similar-
   4.260 +            ity. From left to right and top to bottom, the genes are Ssr1, Efcbp1, Aph1a,
   4.261 +            Ptk7, Aph1a again, and Lepr
   4.262              this includes many areas which don’t have a salient border matching the areal
   4.263              border. The geometric method identifies genes whose salient expression border
   4.264              seems to partially line up with the border of AUD; its weakness is that this
   4.265 @@ -240,163 +341,38 @@
   4.266              may be particularly good markers.   None of these genes are,  individually,  a
   4.267              perfect marker for AUD; we deliberately chose a “difficult” area in order to
   4.268              better contrast pointwise with geometric methods.
   4.269 -             Principle 4: Work in 2-D whenever possible
   4.270 -            In anatomy, the manifold of interest is usually either defined by a combination
   4.271 -            of two relevant anatomical axes (todo), or by the surface of the structure (as is
   4.272 -            the case with the cortex). In the former case, the manifold of interest is a plane,
   4.273 -            but in the latter case it is curved.  If the manifold is curved, there are various
   4.274 -            methods for mapping the manifold into a plane.
   4.275 -               The method that we will develop will begin by mapping the data into a
   4.276 -            2-D plane.  Although the manifold that characterized cortical areas is known
   4.277 -            to be the cortical surface, it remains to be seen which method of mapping the
   4.278 -            manifold into a plane is optimal for this application. We will compare mappings
   4.279 -            which attempt to preserve size (such as the one used by Caret??) with mappings
   4.280 -            which preserve angle (conformal maps).
   4.281 -               Although there is much 2-D organization in anatomy, there are also struc-
   4.282 -            tures whose shape is fundamentally 3-dimensional.  If possible, we would like
   4.283 -            the method we develop to include a statistical test that warns the user if the
   4.284 -            assumption of 2-D structure seems to be wrong.
   4.285 -               ——
   4.286 -____________________
   4.287 -   4For each gene, a logistic regression in which the response variable was whether or not a
   4.288 -surface pixel was within area AUD, and the predictor variable was the value of the expression
   4.289 -of the gene underneath that pixel. The resulting scores were used to rank the genes in terms
   4.290 -of how well they predict area AUD.
   4.291 -    5For each gene the gradient similarity (see section ??) between (a) a map of the expression
   4.292 -of each gene on the cortical surface and (b) the shape of area AUD, was calculated, and this
   4.293 -was used to rank the genes.
   4.294 -                                            7
   4.295 -
   4.296 -                                                        
   4.297 -                                                        
   4.298 -            Figure 2: The top row shows the three genes which (individually) best predict
   4.299 -            area AUD, according to logistic regression.  The bottom row shows the three
   4.300 -            genes which (individually) best match area AUD, according to gradient similar-
   4.301 -            ity. From left to right and top to bottom, the genes are Ssr1, Efcbp1, Aph1a,
   4.302 -            Ptk7, Aph1a again, and Lepr
   4.303 -               Massive new datasets obtained with techniques such as in situ hybridization
   4.304 -            (ISH) and BAC-transgenics allow the expression levels of many genes at many
   4.305 -            locations to be compared.  This can be used to find marker genes for specific
   4.306 -            anatomical structures, as well as to draw new anatomical maps.  Our goal is
   4.307 -            to develop automated methods to relate spatial variation in gene expression to
   4.308 -            anatomy. We have five specific aims:
   4.309 -             (1) develop an algorithm to screen spatial gene expression data for combi-
   4.310 -                 nations of marker genes which selectively target individual anatomical
   4.311 -                 structures
   4.312 -             (2) develop an algorithm to screen spatial gene expression data for combina-
   4.313 -                 tions of marker genes which can be used to delineate most of the bound-
   4.314 -                 aries between a number of anatomical structures at once
   4.315 -             (3) develop an algorithm to suggest new ways of dividing a structure up into
   4.316 -                 anatomical subregions, based on spatial patterns in gene expression
   4.317 -             (4) create a flat (2-D) map of the mouse cerebral cortex that contains a flat-
   4.318 -                 tened version of the Allen Mouse Brain Atlas ISH dataset, as well as the
   4.319 -                 boundaries of anatomical areas within the cortex. For each cortical layer,
   4.320 -                 a layer-specific flat dataset will be created. A single combined flat dataset
   4.321 -                 will be created which averages information from all of the layers.  These
   4.322 -                 datasets will be made available in both MATLAB and Caret formats.
   4.323 -             (5) validate the methods developed in (1), (2) and (3) by applying them to
   4.324 -                 the cerebral cortex datasets created in (4)
   4.325 -                                            8
   4.326 -
   4.327 -               All algorithms that we develop will be implemented in an open-source soft-
   4.328 -            ware toolkit. The toolkit, as well as the machine-readable datasets developed in
   4.329 -            aim (4) and any other intermediate dataset we produce, will be published and
   4.330 -            freely available for others to use.
   4.331 -               In addition to developing generally useful methods, the application of these
   4.332 -            methods to cerebral cortex will produce immediate benefits that are only one
   4.333 -            step removed from clinical application, while also supporting the development
   4.334 -            of new neuroanatomical techniques.  The method developed in aim (1) will be
   4.335 -            applied to each cortical area to find a set of marker genes.  Currently, despite
   4.336 -            the distinct roles of different cortical areas in both normal functioning and
   4.337 -            disease processes, there are no known marker genes for many cortical areas.
   4.338 -            Finding marker genes will be immediately useful for drug discovery as well as for
   4.339 -            experimentation because once marker genes for an area are known, interventions
   4.340 -            can be designed which selectively target that area.
   4.341 -               The method developed in aim (2) will be used to find a small panel of genes
   4.342 -            that can find most of the boundaries between areas in the cortex. Today, finding
   4.343 -            cortical areal boundaries in a tissue sample is a manual process that requires a
   4.344 -            skilled human to combine multiple visual cues over a large area of the cortical
   4.345 -            surface. A panel of marker genes will allow the development of an ISH protocol
   4.346 -            that will allow experimenters to more easily identify which anatomical areas are
   4.347 -            present in small samples of cortex.
   4.348 -               For each cortical layer, a layer-specific flat dataset will be created. A single
   4.349 -            combined flat dataset will be created which averages information from all of
   4.350 -            the layers. These datasets will be made available in both MATLAB and Caret
   4.351 -            formats.
   4.352 -___________________________________________________________
   4.353 -    New techniques allow the expression levels of many genes at many locations
   4.354 -to be compared. It is thought that even neighboring anatomical structures have
   4.355 -different gene expression profiles.  We propose to develop automated methods
   4.356 -to relate the spatial variation in gene expression to anatomy.  We will develop
   4.357 -two kinds of techniques:
   4.358 -  (a) techniques to screen for combinations of marker genes which selectively
   4.359 -       target anatomical structures
   4.360 -  (b) techniques to suggest new ways of dividing a structure up into anatomical
   4.361 -       subregions, based on the shapes of contours in the gene expression
   4.362 -    The first kind of technique will be helpful for finding marker genes associated
   4.363 -with known anatomical features. The second kind of technique will be helpful in
   4.364 -creating new anatomical maps, maps which reflect differences in gene expression
   4.365 -the same way that existing maps reflect differences in histology.
   4.366 -    We intend to develop our techniques using the adult mouse cerebral cortex
   4.367 -as a testbed.   The Allen Brain Atlas has collected a dataset containing the
   4.368 -expression level of about 4000 genes* over a set of over 150000 voxels, with a
   4.369 -spatial resolution of approximately 200 microns[?].
   4.370 +             Areas which can be identified by single genes
   4.371 +            todo
   4.372 +             Aim 1 (and Aim 3)
   4.373 +             SVM on all genes at once
   4.374 +            In order to see how well one can do when looking at all genes at once, we ran
   4.375 +            a support vector machine to classify cortical surface pixels based on their gene
   4.376 +            expression profiles. We achieved classification accuracy of about 81%6. As noted
   4.377 +            above, however, a classifier that looks at all the genes at once isn’t practically
   4.378 +            useful.
   4.379 +_____________________
   4.380 +   6Using the Shogun SVM package (todo:cite), with parameters type=GMNPSVM (multi-
   4.381 +class b-SVM), kernal = gaussian with sigma = 0.1, c = 10, epsilon = 1e-1 – these are the
   4.382 +first parameters we tried, so presumably performance would improve with different choices of
   4.383 +parameters. 5-fold cross-validation.
   4.384                                              9
   4.385  
   4.386 -               We expect to discover sets of marker genes that pick out specific cortical
   4.387 -            areas.  This will allow the development of drugs and other interventions that
   4.388 -            selectively target individual cortical areas.   Therefore our research will lead
   4.389 -            to application in drug discovery, in the development of other targeted clinical
   4.390 -            interventions, and in the development of new experimental techniques.
   4.391 -               The best way to divide up rodent cortex into areas has not been completely
   4.392 -            determined, as can be seen by the differences in the recent maps given by Swan-
   4.393 -            son on the one hand, and Paxinos and Franklin on the other. It is likely that our
   4.394 -            study, by showing which areal divisions naturally follow from gene expression
   4.395 -            data, as opposed to traditional histological data, will contribute to the creation
   4.396 -            of a better map. While we do not here propose to analyze human gene expres-
   4.397 -            sion data, it is conceivable that the methods we propose to develop could be
   4.398 -            used to suggest modifications to the human cortical map as well.
   4.399 -               In the following, we will only be talking about coronal data.
   4.400 -               The Allen Brain Atlas provides “Smoothed Energy Volumes”, which are
   4.401 -               One type of artifact in the Allen Brain Atlas data is what we call a “slice
   4.402 -            artifact”. We have noticed two types of slice artifacts in the dataset. The first
   4.403 -            type, a “missing slice artifact”, occurs when the ISH procedure on a slice did
   4.404 -            not come out well. In this case, the Allen Brain investigators excluded the slice
   4.405 -            at issue from the dataset.  This means that no gene expression information is
   4.406 -            available for that gene for the region of space covered by that slice. This results
   4.407 -            in an expression level of zero being assigned to voxels covered by the slice. This
   4.408 -            is partially but not completely ameliorated by the smoothing that is applied to
   4.409 -            create the Smoothed Energy Volumes. The usual end result is that a region of
   4.410 -            space which is shaped and oriented like a coronal slice is marked as having less
   4.411 -            gene expression than surrounding regions.
   4.412 -               The second type of slice artifact is caused by the fact that all of the slices
   4.413 -            have a consistent orientation.  Since there may be artifacts (such as how well
   4.414 -            the ISH worked) which are constant within each slice but which vary between
   4.415 -            different slices, the result is that ceteris paribus, when one compares the genetic
   4.416 -            data of a voxel to another voxel within the same coronal plane, one would expect
   4.417 -            to find more similarity than if one compared a voxel to another voxel displaced
   4.418 -            along the rostrocaudal axis.
   4.419 -               We are enthusiastic about the sharing of methods, data, and results, and
   4.420 -            at the conclusion of the project, we will make all of our data and computer
   4.421 -            source code publically available.  Our goal is that replicating our results, or
   4.422 -            applying the methods we develop to other targets, will be quick and easy for
   4.423 -            other investigators. In order to aid in understanding and replicating our results,
   4.424 -            we intend to include a software program which, when run, will take as input
   4.425 -            the Allen Brain Atlas raw data, and produce as output all numbers and charts
   4.426 -            found in publications resulting from the project.
   4.427 -               To aid in the replication of our results, we will include a script which takes
   4.428 -            as input the dataset in aim (3) and provides as output all of the tables in figures
   4.429 -            in our publications .
   4.430 -               We also expect to weigh in on the debate about how to best partition rodent
   4.431 -            cortex
   4.432 -                                            10
   4.433 -
   4.434 -               be useful for drug discovery as well
   4.435 -               * Another 16000 genes are available, but they do not cover the entire cerebral
   4.436 -            cortex with high spatial resolution.
   4.437 -               User-definable ROIs Combinatorial gene expression Negative as well as pos-
   4.438 -            itive signal Use geometry Search for local boundaries if necessary Flatmapped
   4.439 -             Specific aims
   4.440 +               The requirement to find combinations of only a small number of genes limits
   4.441 +            us from straightforwardly applying many of the most simple techniques from
   4.442 +            the field of supervised machine learning.  In the parlance of machine learning,
   4.443 +            our task combines feature selection with supervised learning.
   4.444 +             Decision trees
   4.445 +            todo
   4.446 +             Aim 2 (and Aim 3)
   4.447 +             Raw dimensionality reduction results
   4.448 +             Dimensionality reduction plus K-means or spectral clus-
   4.449 +            tering
   4.450 +             Many areas are captured by clusters of genes
   4.451 +            todo
   4.452 +               todo
   4.453 +             Research plan
   4.454 +            todo
   4.455 +               amongst other thigns:
   4.456              Develop algorithms that find genetic markers for anatomical regions
   4.457                1. Develop scoring measures for evaluating how good individual genes are at
   4.458                   marking areas:  we will compare pointwise, geometric, and information-
   4.459 @@ -415,6 +391,8 @@
   4.460                   the boundaries:  An area may be difficult to identify because the bound-
   4.461                   aries are misdrawn, or because it does not “really” exist as a single area,
   4.462                   at least on the genetic level. We will develop extensions to our procedure
   4.463 +                                            10
   4.464 +
   4.465                   which (a) detect when a difficult area could be fit if its boundary were
   4.466                   redrawn slightly, and (b) detect when a difficult area could be combined
   4.467                   with adjacent areas to create a larger area which can be fit.
   4.468 @@ -428,8 +406,6 @@
   4.469                   matically find the cortical layer boundaries.
   4.470                4. Run the procedures that we developed on the cortex: we will present, for
   4.471                   each area, a short list of markers to identify that area; and we will also
   4.472 -                                            11
   4.473 -
   4.474                   present lists of “panels” of genes that can be used to delineate many areas
   4.475                   at once.
   4.476              Develop algorithms to suggest a division of a structure into anatom-
   4.477 @@ -445,350 +421,27 @@
   4.478                   clustering to create anatomical maps
   4.479                6. Run this algorithm on the cortex: present a hierarchial, genoarchitectonic
   4.480                   map of the cortex
   4.481 -               gradient similarity is calculated as: ∑
   4.482 -  pixels cos(abs(∠∇1 - ∠∇2)) ⋅|∇1|+|∇2|
   4.483 -   2       ⋅
   4.484 -            pixel_value1+pixel_value2
   4.485 -         2
   4.486 -               (todo) Technically, we say that an anatomical structure has a fundamen-
   4.487 -            tally 2-D organization when there exists a commonly used, generic, anatomical
   4.488 -            structure-preserving map from 3-D space to a 2-D manifold.
   4.489 -               Related work:
   4.490 -               The Allen Brain Institute has developed an interactive web interface called
   4.491 -            AGEA which allows an investigator to (1) calculate lists of genes which are se-
   4.492 -            lectively overexpressed in certain anatomical regions (ABA calls this the “Gene
   4.493 -            Finder” function) (2) to visualize the correlation between the genetic profiles of
   4.494 -            voxels in the dataset, and (3) to visualize a hierarchial clustering of voxels in
   4.495 -            the dataset [?].  AGEA is an impressive and useful tool, however, it does not
   4.496 -            solve the same problems that we propose to solve with this project.
   4.497 -               First we describe AGEA’s “Gene Finder”, and then compare it to our pro-
   4.498 -            posed method for finding marker genes.  AGEA’s Gene Finder first asks the
   4.499 -            investigator to select a single “seed voxel” of interest. It then uses a clustering
   4.500 -            method, combined with built-in knowledge of major anatomical structures, to
   4.501 -            select two sets of voxels; an “ROI” and a “comparator region”*. The seed voxel
   4.502 -            is always contained within the ROI, and the ROI is always contained within the
   4.503 -            comparator region.  The comparator region is similar but not identical to the
   4.504 -            set of voxels making up the major anatomical region containing the ROI. Gene
   4.505 -            Finder then looks for genes which can distinguish the ROI from the comparator
   4.506 -            region. Specifically, it finds genes for which the ratio (expression energy in the
   4.507 -            ROI) / (expression energy in the comparator region) is high.
   4.508 +______________________________________________
   4.509 +    stuff  i  dunno  where  to  put  yet  (there  is  more  scattered  through  grant-
   4.510 +oldtext):
   4.511 +                                            11
   4.512 +
   4.513 +             Principle 4: Work in 2-D whenever possible
   4.514 +            In anatomy, the manifold of interest is usually either defined by a combination
   4.515 +            of two relevant anatomical axes (todo), or by the surface of the structure (as is
   4.516 +            the case with the cortex). In the former case, the manifold of interest is a plane,
   4.517 +            but in the latter case it is curved.  If the manifold is curved, there are various
   4.518 +            methods for mapping the manifold into a plane.
   4.519 +               The method that we will develop will begin by mapping the data into a
   4.520 +            2-D plane.  Although the manifold that characterized cortical areas is known
   4.521 +            to be the cortical surface, it remains to be seen which method of mapping the
   4.522 +            manifold into a plane is optimal for this application. We will compare mappings
   4.523 +            which attempt to preserve size (such as the one used by Caret??) with mappings
   4.524 +            which preserve angle (conformal maps).
   4.525 +               Although there is much 2-D organization in anatomy, there are also struc-
   4.526 +            tures whose shape is fundamentally 3-dimensional.  If possible, we would like
   4.527 +            the method we develop to include a statistical test that warns the user if the
   4.528 +            assumption of 2-D structure seems to be wrong.
   4.529                                              12
   4.530  
   4.531 -               Informally, the Gene Finder first infers an ROI based on clustering the seed
   4.532 -            voxel with other voxels.  Then, the Gene Finder finds genes which overexpress
   4.533 -            in the ROI as compared to other voxels in the major anatomical region.
   4.534 -               There are three major differences between our approach and Gene Finder.
   4.535 -               First, Gene Finder focuses on individual genes and individual ROIs in isola-
   4.536 -            tion. This is great for regions which can be picked out from all other regions by a
   4.537 -            single gene, but not all of them can (todo). There are at least two ways this can
   4.538 -            miss out on useful genes. First, a gene might express in part of a region, but not
   4.539 -            throughout the whole region, but there may be another gene which expresses
   4.540 -            in the rest of the region*. Second, a gene might express in a region, but not in
   4.541 -            any of its neighbors, but it might express also in other non-neighboring regions.
   4.542 -            To take advantage of these types of genes, we propose to find combinations of
   4.543 -            genes which, together, can identify the boundaries of all subregions within the
   4.544 -            containing region.
   4.545 -               Second, Gene Finder uses a pointwise metric, namely expression energy ratio,
   4.546 -            to decide whether a gene is good for picking out a region. We have found better
   4.547 -            results by using metrics which take into account not just single voxels, but also
   4.548 -            the local geometry of neighboring voxels, such as the local gradient (todo).  In
   4.549 -            addition, we have found that often the absence of gene expression can be used
   4.550 -            as a marker, which will not be caught by Gene Finder’s expression energy ratio
   4.551 -            (todo).
   4.552 -               Third, Gene Finder chooses the ROI based only on the seed voxel.  This
   4.553 -            often does not permit the user to query the ROI that they are interested in. For
   4.554 -            example, in all of our tests of Gene Finder in cortex, the ROIs chosen tend to
   4.555 -            be cortical layers, rather than cortical areas.
   4.556 -               In summary, when Gene Finder picks the ROI that you want, and when this
   4.557 -            ROI can be easily picked out from neighboring regions by single genes which
   4.558 -            selectively overexpress in the ROI compared to the entire major anatomical re-
   4.559 -            gion, Gene Finder will work. However, Gene Finder will not pick cortical areas
   4.560 -            as ROIs, and even if it could, many cortical areas cannot be uniquely picked out
   4.561 -            by the overexpression of any single gene.  By contrast, we will target cortical
   4.562 -            areas, we will explore a variety of metrics which can complement the shortcom-
   4.563 -            ings of expression energy ratio, and we will use the combinatorial expression of
   4.564 -            genes to pick out cortical areas even when no individual gene will do.
   4.565 -               * The terms “ROI” and “comparator region” are our own; the ABI calls
   4.566 -            them the “local region” and the “larger anatomical context”. The ABI uses the
   4.567 -            term “specificity comparator” to mean the major anatomic region containing
   4.568 -            the ROI, which is not exactly identical to the comparator region.
   4.569 -               ** In this case, the union of the area of expression of the two genes would
   4.570 -            suffice; one could also imagine that there could be situations in which the in-
   4.571 -            tersection of multiple genes would be needed, or a combination of unions and
   4.572 -            intersections.
   4.573 -               Now we describe AGEA’s hierarchial clustering, and compare it to our pro-
   4.574 -            posal. The goal of AGEA’s hierarchial clustering is to generate a binary tree of
   4.575 -            clusters, where a cluster is a collection of voxels.  AGEA begins by computing
   4.576 -            the Pearson correlation between each pair of voxels. They then employ a recur-
   4.577 -                                            13
   4.578 -
   4.579 -            sive divisive (top-down) hierarchial clustering procedure on the voxels, which
   4.580 -            means that they start with all of the voxels, and then they divide them into clus-
   4.581 -            ters, and then within each cluster, they divide that cluster into smaller clusters,
   4.582 -            etc***.  At each step, the collection of voxels is partitioned into two smaller
   4.583 -            clusters in a way that maximizes the following quantity:  average correlation
   4.584 -            between all possible pairs of voxels containing one voxel from each cluster.
   4.585 -               There are three major differences between our approach and AGEA’s hier-
   4.586 -            archial clustering.  First, AGEA’s clustering method separates cortical layers
   4.587 -            before it separates cortical areas.
   4.588 -               following procedure is used for the purpose of dividing a collection of voxels
   4.589 -            into smaller clusters: partition the voxels into two sets, such that the following
   4.590 -            quantity is maximized:
   4.591 -               *** depending on which level of the tree is being created, the voxels are
   4.592 -            subsampled in order to save time
   4.593 -               does not allow the user to input anything other than a seed voxel; this means
   4.594 -            that for each seed voxel, there is only one
   4.595 -               The role of the “local region” is to serve as a region of interest for which
   4.596 -            marker genes are desired; the role of the “larger anatomical context” is to be
   4.597 -            the structure
   4.598 -               There are two kinds of differences between AGEA and our project; differ-
   4.599 -            ences that relate to the treatment of the cortex, and differences in the type of
   4.600 -            generalizable methods being developed. As relates
   4.601 -               indicate an ROI
   4.602 -               explore simple correlation-based relationships between voxels,  genes,  and
   4.603 -            clusters of voxels.
   4.604 -               There have not yet been any studies which describe the results of applying
   4.605 -            AGEA to the cerebral cortex; however, we suspect that the AGEA metrics are
   4.606 -            not optimal for the task of relating genes to cortical areas.   A voxel’s gene
   4.607 -            expression profile depends upon both its cortical area and its cortical layer,
   4.608 -            however, AGEA has no mechanism to distinguish these two. As a result, voxels
   4.609 -            in the same layer but different areas are often clustered together by AGEA. As
   4.610 -            part of the project, we will compare the performance of our techniques against
   4.611 -            AGEA’s.
   4.612 -               —
   4.613 -               The Allen Brain Institute has developed interactive tools called AGEA which
   4.614 -            allow an investigator to explore simple correlation-based relationships between
   4.615 -            voxels,  genes,  and clusters of voxels.   There have not yet been any studies
   4.616 -            which describe the results of applying AGEA to the cerebral cortex; however,
   4.617 -            we suspect that the AGEA metrics are not optimal for the task of relating
   4.618 -            genes to cortical areas.  A voxel’s gene expression profile depends upon both
   4.619 -            its cortical area and its cortical layer, however, AGEA has no mechanism to
   4.620 -            distinguish these two.  As a result, voxels in the same layer but different areas
   4.621 -            are often clustered together by AGEA. As part of the project, we will compare
   4.622 -            the performance of our techniques against AGEA’s.
   4.623 -               Another difference between our techniques and AGEA’s is that AGEA allows
   4.624 -            the user to enter only a voxel location, and then to either explore the rest of
   4.625 -                                            14
   4.626 -
   4.627 -            the brain’s relationship to that particular voxel, or explore a partitioning of
   4.628 -            the brain based on pairwise voxel correlation. If the user is interested not in a
   4.629 -            single voxel, but rather an entire anatomical structure, AGEA will only succeed
   4.630 -            to the extent that the selected voxel is a typical representative of the structure.
   4.631 -            As discussed in the previous paragraph, this poses problems for structures like
   4.632 -            cortical areas, which (because of their division into cortical layers) do not have
   4.633 -            a single “typical representative”.
   4.634 -               By contrast, in our system, the user will start by selecting, not a single voxel,
   4.635 -            but rather, an anatomical superstructure to be divided into pieces (for example,
   4.636 -            the cerebral cortex).  We expect that our methods will take into account not
   4.637 -            just pairwise statistics between voxels, but also large-scale geometric features
   4.638 -            (for example, the rapidity of change in gene expression as regional boundaries
   4.639 -            are crossed) which optimize the discriminability of regions within the selected
   4.640 -            superstructure.
   4.641 -               —–
   4.642 -               screen for combinations of marker genes which selectively target anatom-
   4.643 -            ical structures pick delineate the boundaries between neighboring anatomical
   4.644 -            structures. (b) techniques to screen for marker genes which pick out anatomical
   4.645 -            structures of interest
   4.646 -               ,  techniques  which:  (a)  screen  for  marker  genes  ,  and  (b)  suggest  new
   4.647 -            anatomical maps based on
   4.648 -               whose expression partitions the region of interest into its anatomical sub-
   4.649 -            structures, and (b) use the natural contours of gene expression to suggest new
   4.650 -            ways of dividing an organ into
   4.651 -               The Allen Brain Atlas
   4.652 -               –
   4.653 -               to: brooksl@mail.nih.gov
   4.654 -               Hi, I’m writing to confirm the applicability of a potential research project to
   4.655 -            the challenge grant topic ”New computational and statistical methods for the
   4.656 -            analysis of large data sets from next-generation sequencing technologies”.
   4.657 -               We want to develop methods for the analysis of gene expression datasets that
   4.658 -            can be used to uncover the relationships between gene expression and anatomical
   4.659 -            regions. Specifically, we want to develop techniques to (a) given a set of known
   4.660 -            anatomical areas, identify genetic markers for each of these areas, and (b) given
   4.661 -            an anatomical structure whose substructure is unknown, suggest a map, that
   4.662 -            is, a division of the space into anatomical sub-structures, that represents the
   4.663 -            boundaries inherent in the gene expression data.
   4.664 -               We propose to develop our techniques on the Allen Brain Atlas mouse brain
   4.665 -            gene expression dataset by finding genetic markers for anatomical areas within
   4.666 -            the cerebral cortex.  The Allen Brain Atlas contains a registered 3-D map of
   4.667 -            gene expression data with 200-micron voxel resolution which was created from
   4.668 -            in situ hybridization data.  The dataset contains about 4000 genes which are
   4.669 -            available at this resolution across the entire cerebral cortex.
   4.670 -               Despite the distinct roles of different cortical areas in both normal function-
   4.671 -            ing and disease processes, there are no known marker genes for many cortical
   4.672 -            areas. This project will be immediately useful for both drug discovery and clini-
   4.673 -                                            15
   4.674 -
   4.675 -            cal research because once the markers are known, interventions can be designed
   4.676 -            which selectively target specific cortical areas.
   4.677 -               This techniques we develop will be useful because they will be applicable to
   4.678 -            the analysis of other anatomical areas, both in terms of finding marker genes
   4.679 -            for known areas, and in terms of suggesting new anatomical subdivisions that
   4.680 -            are based upon the gene expression data.
   4.681 -_______________________________
   4.682 -    It is likely that our study, by showing which areal divisions naturally fol-
   4.683 -low from gene expression data, as opposed to traditional histological data, will
   4.684 -contribute to the creation of
   4.685 -    there are clear genetic or chemical markers known for only a few cortical
   4.686 -areas. This makes it difficult to target drugs to specific
   4.687 -    As part of aims (1) and (5), we will discover sets of marker genes that pick
   4.688 -out specific cortical areas.  This will allow the development of drugs and other
   4.689 -interventions that selectively target individual cortical areas.  As part of aims
   4.690 -(2) and (5), we will also discover small panels of marker genes that can be used
   4.691 -to delineate most of the cortical areal map.
   4.692 -    With aims (2) and (4), we
   4.693 -    There are five principals
   4.694 -    In addition to validating the usefulness of the algorithms, the application of
   4.695 -these methods to cerebral cortex will produce immediate benefits that are only
   4.696 -one step removed from clinical application.
   4.697 -    todo: remember to check gensat, etc for validation (mention bias/variance)
   4.698 - Why it is useful to apply these methods to cortex
   4.699 -There is still room for debate as to exactly how the cortex should be parcellated
   4.700 -into areas.
   4.701 -    The best way to divide up rodent cortex into areas has not been completely
   4.702 -determined,
   4.703 -    not yet been accounted for in
   4.704 -    that the expression of some genes will contain novel spatial patterns which
   4.705 -are not account
   4.706 -    that a genoarchitectonic map
   4.707 -    This principle is only applicable to aim 1 (marker genes). For aim 2 (partition
   4.708 -a structure in into anatomical subregions), we plan to work with many genes at
   4.709 -once.
   4.710 -    tood: aim 2 b+s?
   4.711 - Principle 5: Interoperate with existing tools
   4.712 -In order for our software to be as useful as possible for our users, it will be
   4.713 -able to import and export data to standard formats so that users can use our
   4.714 -software in tandem with other software tools created by other teams.  We will
   4.715 -support the following formats:  NIFTI (Neuroimaging Informatics Technology
   4.716 -                                            16
   4.717 -
   4.718 -            Initiative), SEV (Allen Brain Institute Smoothed Energy Volume), and MAT-
   4.719 -            LAB. This ensures that our users will not have to exclusively rely on our tools
   4.720 -            when analyzing data. For example, users will be able to use the data visualiza-
   4.721 -            tion and analysis capabilities of MATLAB and Caret alongside our software.
   4.722 -               To our knowledge, there is no currently available software to convert between
   4.723 -            these formats, so we will also provide a format conversion tool.  This may be
   4.724 -            useful even for groups that don’t use any of our other software.
   4.725 -               todo: is “marker gene” even a phrase that we should use at all?
   4.726 -               note for aim 1 apps: combo of genes is for voxel, not within any single cell
   4.727 -               , as when genetic markers allow the development of selective interventions;
   4.728 -            the reason that one can be confident that the intervention is selective is that it
   4.729 -            is only turned on when a certain combination of genes is turned on and off. The
   4.730 -            result procedure is what assures us that when that combination is present, the
   4.731 -            local tissue is probably part of a certain subregion.
   4.732 -               The basic idea is that we want to find a procedure by
   4.733 -               The task of finding genes that mark anatomical areas can be phrased in
   4.734 -            terms of what the field of machine learning calls a “supervised learning” task.
   4.735 -            The goal of this task is to learn a function (the “classifier”) which
   4.736 -               If a person knows a combination of genes that mark an area, that implies
   4.737 -            that the person can be told how strong those genes express in any voxel, and
   4.738 -            the person can use this information to determine how
   4.739 -               finding how to infer the areal identity of a voxel if given the gene expression
   4.740 -            profile of that voxel.
   4.741 -               For each voxel in the cortex, we want to start with data about the gene
   4.742 -            expression
   4.743 -               There are various ways to look for marker genes. We will define some terms,
   4.744 -            and along the way we will describe a few design choices encountered in the
   4.745 -            process of creating a marker gene finding method, and then we will present four
   4.746 -            principles that describe which options we have chosen.
   4.747 -               In developing a procedure for finding marker genes,  we are developing a
   4.748 -            procedure that takes a dataset of experimental observations and produces a
   4.749 -            result. One can think of the result as merely a list of genes, but really the result
   4.750 -            is an understanding of a predictive relationship between, on the one hand, the
   4.751 -            expression levels of genes, and, on the other hand, anatomical subregions.
   4.752 -               One way to more formally define this understanding is to look at it as a
   4.753 -            procedure. In this view, the result of the learning procedure is itself a procedure.
   4.754 -            The result procedure provides a way to use the gene expression profiles of voxels
   4.755 -            in a tissue sample in order to determine where the subregions are.
   4.756 -               This result procedure can be used directly, as when an experimenter has
   4.757 -            a tissue sample and needs to know what subregions are present in it,  and,
   4.758 -            if multiple subregions are present,  where they each are.   Or it can be used
   4.759 -            indirectly; imagine that the result procedure tells us that whenever a certain
   4.760 -            combination of genes are expressed, the local tissue is probably part of a certain
   4.761 -            subregion.  This means that we can then confidentally develop an intervention
   4.762 -            which is triggered only when that combination of genes are expressed; and to
   4.763 -                                            17
   4.764 -
   4.765 -            the extent that the result procedure is reliable, we know that the intervention
   4.766 -            will only be triggered in the target subregion.
   4.767 -               We said that the result procedure provides “a way to use the gene expression
   4.768 -            profiles of voxels in a tissue sample” in order to “determine where the subregions
   4.769 -            are”.
   4.770 -               Does the result procedure get as input all of the gene expression profiles
   4.771 -            of each voxel in the entire tissue sample,  and produce as output all of the
   4.772 -            subregional boundaries all at once?
   4.773 -               it is helpful for the classifier to look at the global “shape” of gene expression
   4.774 -            patterns over the whole structure, rather than just nearby voxels.
   4.775 -               there is some small bit of additional information that can be gleaned from
   4.776 -            knowing the
   4.777 -             Design choices for a supervised learning procedure
   4.778 -            After all,
   4.779 -               there is a small correlation between the gene expression levels from distant
   4.780 -            voxels and
   4.781 -               Depending on how we intend to use the classifier, we may want to design it
   4.782 -            so that
   4.783 -               It is possible for many things to
   4.784 -               The choice of which data is made part of an instance
   4.785 -               what we seek is a procedure
   4.786 -               partition the tissue sample into subregions.
   4.787 -               each part of the anatomical structure
   4.788 -               must be One way to rephrase this task is to say that, instead of searching
   4.789 -            for the location of the subregions, we are looking to partition the tissue sample
   4.790 -            into subregions.
   4.791 -               There are various ways to look for marker genes. We will define some terms,
   4.792 -            and along the way we will describe a few design choices encountered in the
   4.793 -            process of creating a marker gene finding method, and then we will present four
   4.794 -            principles that describe which options we have chosen.
   4.795 -               In developing a procedure for finding marker genes,  we are developing a
   4.796 -            procedure that takes a dataset of experimental observations and produces a
   4.797 -            result. One can think of the result as merely a list of genes, but really the result
   4.798 -            is an understanding of a predictive relationship between, on the one hand, the
   4.799 -            expression levels of genes, and, on the other hand, anatomical subregions.
   4.800 -               One way to more formally define this understanding is to look at it as a
   4.801 -            procedure. In this view, the result of the learning procedure is itself a procedure.
   4.802 -            The result procedure provides a way to use the gene expression profiles of voxels
   4.803 -            in a tissue sample in order to determine where the subregions are.
   4.804 -               This result procedure can be used directly, as when an experimenter has
   4.805 -            a tissue sample and needs to know what subregions are present in it,  and,
   4.806 -            if multiple subregions are present,  where they each are.   Or it can be used
   4.807 -            indirectly; imagine that the result procedure tells us that whenever a certain
   4.808 -            combination of genes are expressed, the local tissue is probably part of a certain
   4.809 -                                            18
   4.810 -
   4.811 -            subregion.  This means that we can then confidentally develop an intervention
   4.812 -            which is triggered only when that combination of genes are expressed; and to
   4.813 -            the extent that the result procedure is reliable, we know that the intervention
   4.814 -            will only be triggered in the target subregion.
   4.815 -               We said that the result procedure provides “a way to use the gene expression
   4.816 -            profiles of voxels in a tissue sample” in order to “determine where the subregions
   4.817 -            are”.
   4.818 -               Does the result procedure get as input all of the gene expression profiles
   4.819 -            of each voxel in the entire tissue sample,  and produce as output all of the
   4.820 -            subregional boundaries all at once?
   4.821 -               Or are we given one voxel at a time,
   4.822 -               In the jargon of the field of machine learning, the result procedure is called
   4.823 -            a classifier.
   4.824 -               The task of finding genes that mark anatomical areas can be phrased in
   4.825 -            terms of what the field of machine learning calls a “supervised learning” task.
   4.826 -            The goal of this task is to learn a function (the “classifier”) which
   4.827 -               If a person knows a combination of genes that mark an area, that implies
   4.828 -            that the person can be told how strong those genes express in any voxel, and
   4.829 -            the person can use this information to determine how
   4.830 -               finding how to infer the areal identity of a voxel if given the gene expression
   4.831 -            profile of that voxel.
   4.832 -               For each voxel in the cortex, we want to start with data about the gene
   4.833 -            expression
   4.834 -               single voxels, but rather groups of voxels, such that the groups can be placed
   4.835 -            in some 2-D space. We will call such instances “pixels”.
   4.836 -               We have been speaking as if instances necessarily correspond to single voxels.
   4.837 -            But it is possible for instances to be groupings of many voxels, in which case
   4.838 -            each grouping must be assigned the same label (that is, each voxel grouping
   4.839 -            must stay inside a single anatomical subregion).
   4.840 -               In some but not all cases, the groups are either rows or columns of voxels.
   4.841 -            This is the case with the cerebral cortex, in which one may assume that columns
   4.842 -            of voxels which run perpendicular to the cortical surface all share the same areal
   4.843 -            identity. In the cortex, we call such an instance a “surface pixel”, because such
   4.844 -            an instance represents the data associated with all voxels underneath a specific
   4.845 -            patch of the cortical surface.
   4.846 -                                            19
   4.847 -
   4.848 -
   4.849 +
     5.1 Binary file grant.odt has changed
     6.1 Binary file grant.pdf has changed
     7.1 --- a/grant.txt	Sat Apr 11 21:43:12 2009 -0700
     7.2 +++ b/grant.txt	Sun Apr 12 02:49:55 2009 -0700
     7.3 @@ -1,7 +1,5 @@
     7.4  == Specific aims ==
     7.5  
     7.6 -todo test4
     7.7 -
     7.8  Massive new datasets obtained with techniques such as in situ hybridization (ISH) and BAC-transgenics allow the expression levels of many genes at many locations to be compared. Our goal is to develop automated methods to relate spatial variation in gene expression to anatomy. We want to find marker genes for specific anatomical regions, and also to draw new anatomical maps based on gene expression patterns. We have three specific aims:
     7.9  
    7.10  (1) develop an algorithm to screen spatial gene expression data for combinations of marker genes which selectively target anatomical regions
    7.11 @@ -17,12 +15,12 @@
    7.12  == Background and significance ==
    7.13  
    7.14  === Aim 1 ===
    7.15 -==== Machine learning terminology ====
    7.16 +==== Machine learning terminology: supervised learning ====
    7.17  The task of looking for marker genes for anatomical subregions means that one is looking for a set of genes such that, if the expression level of those genes is known, then the locations of the subregions can be inferred. 
    7.18  
    7.19  If we define the subregions so that they cover the entire anatomical structure to be divided, then instead of saying that we are using gene expression to find the locations of the subregions, we may say that we are using gene expression to determine to which subregion each voxel within the structure belongs. We call this a __classification task__, because each voxel is being assigned to a class (namely, its subregion).
    7.20  
    7.21 -Therefore, an understanding of the relationship between the combination of their expression levels and the locations of the subregions may be expressed as a function. The input to this function is a voxel, along with the gene expression levels within that voxel; the output is the subregional identity of the target voxel, that is, the subregion to which the target voxel belongs. We call this function a __classifier__. In general, the input to a classifier is called an __instance__, and the output is called a __label__. 
    7.22 +Therefore, an understanding of the relationship between the combination of their expression levels and the locations of the subregions may be expressed as a function. The input to this function is a voxel, along with the gene expression levels within that voxel; the output is the subregional identity of the target voxel, that is, the subregion to which the target voxel belongs. We call this function a __classifier__. In general, the input to a classifier is called an __instance__, and the output is called a __label__ (or a __class label__).
    7.23  
    7.24  The object of aim 1 is not to produce a single classifier, but rather to develop an automated method for determining a classifier for any known anatomical structure. Therefore, we seek a procedure by which a gene expression dataset may be analyzed in concert with an anatomical atlas in order to produce a classifier. Such a procedure is a type of a machine learning procedure. The construction of the classifier is called __training__ (also __learning__), and the initial gene expression dataset used in the construction of the classifier is called __training data__.
    7.25  
    7.26 @@ -57,7 +55,37 @@
    7.27  
    7.28  
    7.29  === Aim 2 ===
    7.30 -todo
    7.31 +==== Machine learning terminology: clustering ====
    7.32 +If one is given a dataset consisting merely of instances, with no class labels, then analysis of the dataset is referred to as __unsupervised learning__ in the jargon of machine learning. One thing that you can do with such a dataset is to group instances together. A set of similar instances is called a __cluster__, and the activity of finding grouping the data into clusters is called clustering or cluster analysis.
    7.33 +
    7.34 +The task of deciding how to carve up a structure into anatomical subregions can be put into these terms. The instances are once again voxels (or pixels) along with their associated gene expression profiles. We make the assumption that voxels from the same subregion have similar gene expression profiles, at least compared to the other subregions. This means that clustering voxels is the same as finding potential subregions; we seek a partitioning of the voxels into subregions, that is, into clusters of voxels with similar gene expression.
    7.35 +
    7.36 +It is desirable to determine not just one set of subregions, but also how these subregions relate to each other, if at all; perhaps some of the subregions are more similar to each other than to the rest, suggesting that, although at a fine spatial scale they could be considered separate, on a coarser spatial scale they could be grouped together into one large subregion. This suggests the outcome of clustering may be a hierarchial tree of clusters, rather than a single set of clusters which partition the voxels. This is called hierarchial clustering.
    7.37 +
    7.38 +==== Similarity scores ====
    7.39 +
    7.40 +todo
    7.41 +
    7.42 +==== Spatially contiguous clusters; image segmentation ====
    7.43 +
    7.44 +We have shown that aim 2 is a type of clustering task. In fact, it is a special type of clustering task because we have an additional constraint on clusters; voxels grouped together into a cluster must be spatially contiguous. In Preliminary Results, we show that one can get reasonable results without enforcing this constraint, however, we plan to compare these results against other methods which guarantee contiguous clusters.
    7.45 +
    7.46 +Perhaps the biggest source of continguous clustering algorithms is the field of computer vision, which has produced a variety of image segmentation algorithms. Image segmentation is the task of partitioning the pixels in a digital image into clusters, usually contiguous clusters. Aim 2 is similar to an image segmentation task. There are two main differences; in our task, there are thousands of color channels (one for each gene), rather than just three. There are imaging tasks which use more than three colors, however, for example multispectral imaging and hyperspectral imaging, which are often used to process satellite imagery. A more crucial difference is that there are various cues which are appropriate for detecting sharp object boundaries in a visual scene but which are not appropriate for segmenting abstract spatial data such as gene expression. Although many image segmentation algorithms can be expected to work well for segmenting other sorts of spatially arranged data, some of these algorithms are specialized for visual images.
    7.47 +
    7.48 +==== Dimensionality reduction ====
    7.49 +
    7.50 +Unlike aim 1, there is no externally-imposed need to select only a handful of informative genes for inclusion in the instances. However, some clustering algorithms perform better on small numbers of features. There are techniques which "summarize" a larger number of features using a smaller number of features; these techniques go by the name of feature extraction or dimensionality reduction. The small set of features that such a technique yields is called the __reduced feature set__. After the reduced feature set is created, the instances may be replaced by __reduced instances__, which have as their features the reduced feature set rather than the original feature set of all gene expression levels. Note that the features in the reduced feature set do not necessarily correspond to genes; each feature in the reduced set may be any function of the set of gene expression levels.
    7.51 +
    7.52 +Another use for dimensionality reduction is to visualize the relationships between subregions. For example, one might want to make a 2-D plot upon which each subregion is represented by a single point, and with the property that subregions with similar gene expression profiles should be nearby on the plot (that is, the property that distance between pairs of points in the plot should be proportional to some measure of dissimilarity in gene expression). It is likely that no arrangement of the points on a 2-D plan will exactly satisfy this property -- however, dimensionality reduction techniques allow one to find arrangements of points that approximately satisfy that property. Note that in this application, dimensionality reduction is being applied after clustering; whereas in the previous paragraph, we were talking about using dimensionality reduction before clustering.
    7.53 +
    7.54 +==== Clustering genes rather than voxels ====
    7.55 +
    7.56 +Although the ultimate goal is to cluster the instances (voxels or pixels), one strategy to achieve this goal is to first cluster the features (genes). There are two ways that clusters of genes could be used. 
    7.57 +
    7.58 +Gene clusters could be used as part of dimensionality reduction: rather than have one feature for each gene, we could have one reduced feature for each gene cluster.
    7.59 +
    7.60 +Gene clusters could also be used to directly yield a clustering on instances. This is because many genes have an expression pattern which seems to pick out a single, spatially continguous subregion. Therefore, it seems likely that an anatomically interesting subregion will have multiple genes which each individually pick it out\footnote{This would seem to contradict our finding in aim 1 that some cortical areas are combinatorially coded by multiple genes. However, it is possible that the currently accepted cortical maps divide the cortex into subregions which are unnatural from the point of view of gene expression; perhaps there is some other way to map the cortex for which each subregion can be identified by single genes.}. This suggests the following procedure: cluster together genes which pick out similar subregions, and then to use the more popular common subregions as the final clusters. In the Preliminary Data we show that a number of anatomically recognized cortical regions, as well as some "superregions" formed by lumping together a few regions, are associated with gene clusters in this fashion.
    7.61 +
    7.62  
    7.63  
    7.64  
    7.65 @@ -85,11 +113,19 @@
    7.66  === Related work ===
    7.67  todo
    7.68  
    7.69 +vs. AGEA -- i wrote something on this but i'm going to rewrite it
    7.70 +
    7.71  
    7.72  == Preliminary work ==
    7.73  
    7.74 -=== Justification of principles 1 thur 3 ===
    7.75 -==== Principle 1: Combinatorial gene expression ====
    7.76 +=== Format conversion between SEV, MATLAB, NIFTI ===
    7.77 +todo
    7.78 +
    7.79 +=== Flatmap of cortex ===
    7.80 +todo
    7.81 +
    7.82 +
    7.83 +==== Using combinations of multiple genes is necessary and sufficient to delineate some cortical areas ====
    7.84  Here we give an example of a cortical area which is not marked by any single gene, but which can be identified combinatorially. according to logistic regression, gene wwc1\footnote{"WW, C2 and coiled-coil domain containing 1"; EntrezGene ID 211652} is the best fit single gene for predicting whether or not a pixel on the cortical surface belongs to the motor area (area MO). The upper-left picture in Figure \ref{MOcombo} shows wwc1's spatial expression pattern over the cortex. The lower-right boundary of MO is represented reasonably well by this gene, however the gene overshoots the upper-left boundary. This flattened 2-D representation does not show it, but the area corresponding to the overshoot is the medial surface of the cortex. MO is only found on the lateral surface (todo).
    7.85  
    7.86  Gnee mtif2\footnote{"mitochondrial translational initiation factor 2"; EntrezGene ID 76784} is shown in figure the upper-right of Fig. \ref{MOcombo}. Mtif2 captures MO's upper-left boundary, but not its lower-right boundary. Mtif2 does not express very much on the medial surface. By adding together the values at each pixel in these two figures, we get the lower-left of Figure \ref{MOcombo}. This combination captures area MO much better than any single gene. 
    7.87 @@ -102,13 +138,8 @@
    7.88  \caption{Upper left: $wwc1$. Upper right: $mtif2$. Lower left: wwc1 + mtif2 (each pixel's value on the lower left is the sum of the corresponding pixels in the upper row). Within each picture, the vertical axis roughly corresponds to anterior at the top and posterior at the bottom, and the horizontal axis roughly corresponds to medial at the left and lateral at the right. The red outline is the boundary of region MO. Pixels are colored approximately according to the density of expressing cells underneath each pixel, with red meaning a lot of expression and blue meaning little.}
    7.89  \end{figure}
    7.90  
    7.91 -==== Principle 2: Only look at combinations of small numbers of genes ====
    7.92 -In order to see how well one can do when looking at all genes at once, we ran a support vector machine to classify cortical surface pixels based on their gene expression profiles. We achieved classification accuracy of about 81%\footnote{Using the Shogun SVM package (todo:cite), with parameters type=GMNPSVM (multiclass b-SVM), kernal = gaussian with sigma = 0.1, c = 10, epsilon = 1e-1 -- these are the first parameters we tried, so presumably performance would improve with different choices of parameters. 5-fold cross-validation.}. As noted above, however, a classifier that looks at all the genes at once isn't practically useful. 
    7.93 -
    7.94 -The requirement to find combinations of only a small number of genes limits us from straightforwardly applying many of the most simple techniques from the field of supervised machine learning. In the parlance of machine learning, our task combines feature selection with supervised learning.
    7.95 -
    7.96 -
    7.97 -==== Principle 3: Use geometry ====
    7.98 +
    7.99 +==== Geometric and pointwise scoring methods provide complementary information ====
   7.100  
   7.101  
   7.102  To show that local geometry can provide useful information that cannot be detected via pointwise analyses, consider Fig. \ref{AUDgeometry}. The top row of Fig. \ref{AUDgeometry} displays the 3 genes which most match area AUD, according to a pointwise method\footnote{For each gene, a logistic regression in which the response variable was whether or not a surface pixel was within area AUD, and the predictor variable was the value of the expression of the gene underneath that pixel. The resulting scores were used to rank the genes in terms of how well they predict area AUD.}. The bottom row displays the 3 genes which most match AUD according to a method which considers local geometry\footnote{For each gene the gradient similarity (see section \ref{gradientSim}) between (a) a map of the expression of each gene on the cortical surface and (b) the shape of area AUD, was calculated, and this was used to rank the genes.} The pointwise method in the top row identifies genes which express more strongly in AUD than outside of it; its weakness is that this includes many areas which don't have a salient border matching the areal border. The geometric method identifies genes whose salient expression border seems to partially line up with the border of AUD; its weakness is that this includes genes which don't express over the entire area. Genes which have high rankings using both pointwise and border criteria, such as $Aph1a$ in the example, may be particularly good markers. None of these genes are, individually, a perfect marker for AUD; we deliberately chose a "difficult" area in order to better contrast pointwise with geometric methods.
   7.103 @@ -125,130 +156,51 @@
   7.104  \caption{The top row shows the three genes which (individually) best predict area AUD, according to logistic regression. The bottom row shows the three genes which (individually) best match area AUD, according to gradient similarity. From left to right and top to bottom, the genes are $Ssr1$, $Efcbp1$, $Aph1a$, $Ptk7$, $Aph1a$ again, and $Lepr$}
   7.105  \end{figure}
   7.106  
   7.107 -
   7.108 -
   7.109 -
   7.110 -
   7.111 -==== Principle 4: Work in 2-D whenever possible ====
   7.112 -
   7.113 -In anatomy, the manifold of interest is usually either defined by a combination of two relevant anatomical axes (todo), or by the surface of the structure (as is the case with the cortex). In the former case, the manifold of interest is a plane, but in the latter case it is curved. If the manifold is curved, there are various methods for mapping the manifold into a plane.
   7.114 -
   7.115 -The method that we will develop will begin by mapping the data into a 2-D plane. Although the manifold that characterized cortical areas is known to be the cortical surface, it remains to be seen which method of mapping the manifold into a plane is optimal for this application. We will compare mappings which attempt to preserve size (such as the one used by Caret\ref{van_essen_integrated_2001}) with mappings which preserve angle (conformal maps). 
   7.116 -
   7.117 -Although there is much 2-D organization in anatomy, there are also structures whose shape is fundamentally 3-dimensional. If possible, we would like the method we develop to include a statistical test that warns the user if the assumption of 2-D structure seems to be wrong.
   7.118 -
   7.119 -
   7.120 -
   7.121 -
   7.122 -
   7.123 -
   7.124 -
   7.125 -------
   7.126 -
   7.127 -
   7.128 -
   7.129 -Massive new datasets obtained with techniques such as in situ hybridization (ISH) and BAC-transgenics allow the expression levels of many genes at many locations to be compared. This can be used to find marker genes for specific anatomical structures, as well as to draw new anatomical maps. Our goal is to develop automated methods to relate spatial variation in gene expression to anatomy. We have five specific aims:
   7.130 -
   7.131 -(1) develop an algorithm to screen spatial gene expression data for combinations of marker genes which selectively target individual anatomical structures
   7.132 -(2) develop an algorithm to screen spatial gene expression data for combinations of marker genes which can be used to delineate most of the boundaries between a number of anatomical structures at once
   7.133 -(3) develop an algorithm to suggest new ways of dividing a structure up into anatomical subregions, based on spatial patterns in gene expression
   7.134 -(4) create a flat (2-D) map of the mouse cerebral cortex that contains a flattened version of the Allen Mouse Brain Atlas ISH dataset, as well as the boundaries of anatomical areas within the cortex. For each cortical layer, a layer-specific flat dataset will be created. A single combined flat dataset will be created which averages information from all of the layers. These datasets will be made available in both MATLAB and Caret formats.
   7.135 -(5) validate the methods developed in (1), (2) and (3) by applying them to the cerebral cortex datasets created in (4)
   7.136 -
   7.137 -All algorithms that we develop will be implemented in an open-source software toolkit. The toolkit, as well as the machine-readable datasets developed in aim (4) and any other intermediate dataset we produce, will be published and freely available for others to use. 
   7.138 -
   7.139 -In addition to developing generally useful methods, the application of these methods to cerebral cortex will produce immediate benefits that are only one step removed from clinical application, while also supporting the development of new neuroanatomical techniques. The method developed in aim (1) will be applied to each cortical area to find a set of marker genes. Currently, despite the distinct roles of different cortical areas in both normal functioning and disease processes, there are no known marker genes for many cortical areas. Finding marker genes will be immediately useful for drug discovery as well as for experimentation because once marker genes for an area are known, interventions can be designed which selectively target that area. 
   7.140 -
   7.141 -
   7.142 -
   7.143 -
   7.144 -
   7.145 -
   7.146 -
   7.147 -The method developed in aim (2) will be used to find a small panel of genes that can find most of the boundaries between areas in the cortex. Today, finding cortical areal boundaries in a tissue sample is a manual process that requires a skilled human to combine multiple visual cues over a large area of the cortical surface. A panel of marker genes will allow the development of an ISH protocol that will allow experimenters to more easily identify which anatomical areas are present in small samples of cortex.
   7.148 -
   7.149 -
   7.150 -
   7.151 -
   7.152 -
   7.153 -
   7.154 -
   7.155 -
   7.156 -
   7.157 -
   7.158 -
   7.159 -
   7.160 -For each cortical layer, a layer-specific flat dataset will be created. A single combined flat dataset will be created which averages information from all of the layers. These datasets will be made available in both MATLAB and Caret formats.
   7.161 -
   7.162 -
   7.163 -
   7.164 -
   7.165 -
   7.166 -
   7.167 -
   7.168 -
   7.169 -----
   7.170 -
   7.171 -
   7.172 -
   7.173 -New techniques allow the expression levels of many genes at many locations to be compared. It is thought that even neighboring anatomical structures have different gene expression profiles. We propose to develop automated methods to relate the spatial variation in gene expression to anatomy. We will develop two kinds of techniques:
   7.174 -
   7.175 -(a) techniques to screen for combinations of marker genes which selectively target anatomical structures
   7.176 -(b) techniques to suggest new ways of dividing a structure up into anatomical subregions, based on the shapes of contours in the gene expression
   7.177 -
   7.178 -The first kind of technique will be helpful for finding marker genes associated with known anatomical features. The second kind of technique will be helpful in creating new anatomical maps, maps which reflect differences in gene expression the same way that existing maps reflect differences in histology.
   7.179 -
   7.180 -We intend to develop our techniques using the adult mouse cerebral cortex as a testbed. The Allen Brain Atlas has collected a dataset containing the expression level of about 4000 genes* over a set of over 150000 voxels, with a spatial resolution of approximately 200 microns\cite{lein_genome-wide_2007}. 
   7.181 -
   7.182 -We expect to discover sets of marker genes that pick out specific cortical areas. This will allow the development of drugs and other interventions that selectively target individual cortical areas. Therefore our research will lead to application in drug discovery, in the development of other targeted clinical interventions, and in the development of new experimental techniques.
   7.183 -
   7.184 -The best way to divide up rodent cortex into areas has not been completely determined, as can be seen by the differences in the recent maps given by Swanson on the one hand, and Paxinos and Franklin on the other. It is likely that our study, by showing which areal divisions naturally follow from gene expression data, as opposed to traditional histological data, will contribute to the creation of a better map. While we do not here propose to analyze human gene expression data, it is conceivable that the methods we propose to develop could be used to suggest modifications to the human cortical map as well.  
   7.185 -
   7.186 -
   7.187 -In the following, we will only be talking about coronal data.
   7.188 -
   7.189 -The Allen Brain Atlas provides "Smoothed Energy Volumes", which are 
   7.190 -
   7.191 -
   7.192 -One type of artifact in the Allen Brain Atlas data is what we call a "slice artifact". We have noticed two types of slice artifacts in the dataset. The first type, a "missing slice artifact", occurs when the ISH procedure on a slice did not come out well. In this case, the Allen Brain investigators excluded the slice at issue from the dataset. This means that no gene expression information is available for that gene for the region of space covered by that slice. This results in an expression level of zero being assigned to voxels covered by the slice. This is partially but not completely ameliorated by the smoothing that is applied to create the Smoothed Energy Volumes. The usual end result is that a region of space which is shaped and oriented like a coronal slice is marked as having less gene expression than surrounding regions.
   7.193 -
   7.194 -The second type of slice artifact is caused by the fact that all of the slices have a consistent orientation. Since there may be artifacts (such as how well the ISH worked) which are constant within each slice but which vary between different slices, the result is that ceteris paribus, when one compares the genetic data of a voxel to another voxel within the same coronal plane, one would expect to find more similarity than if one compared a voxel to another voxel displaced along the rostrocaudal axis.
   7.195 -
   7.196 -
   7.197 -
   7.198 -
   7.199 -We are enthusiastic about the sharing of methods, data, and results, and at the conclusion of the project, we will make all of our data and computer source code publically available. Our goal is that replicating our results, or applying the methods we develop to other targets, will be quick and easy for other investigators. In order to aid in understanding and replicating our results, we intend to include a software program which, when run, will take as input the Allen Brain Atlas raw data, and produce as output all numbers and charts found in publications resulting from the project.
   7.200 -
   7.201 -
   7.202 -To aid in the replication of our results, we will include a script which takes as input the dataset in aim (3) and provides as output all of the tables in figures in our publications .
   7.203 -
   7.204 -
   7.205 -
   7.206 -
   7.207 -We also expect to weigh in on the debate about how to best partition rodent cortex
   7.208 -
   7.209 -
   7.210 -
   7.211 -be useful for drug discovery as well 
   7.212 -
   7.213 -
   7.214 -
   7.215 -* Another 16000 genes are available, but they do not cover the entire cerebral cortex with high spatial resolution.
   7.216 -
   7.217 -
   7.218 -User-definable ROIs
   7.219 -Combinatorial gene expression
   7.220 -Negative as well as positive signal
   7.221 -Use geometry
   7.222 -Search for local boundaries if necessary
   7.223 -Flatmapped
   7.224 -
   7.225 -
   7.226 -
   7.227 -
   7.228 -
   7.229 -
   7.230 -== Specific aims ==
   7.231 +==== Areas which can be identified by single genes ====
   7.232 +todo
   7.233 +
   7.234 +
   7.235 +=== Aim 1 (and Aim 3) ===
   7.236 +
   7.237 +
   7.238 +==== SVM on all genes at once ====
   7.239 +In order to see how well one can do when looking at all genes at once, we ran a support vector machine to classify cortical surface pixels based on their gene expression profiles. We achieved classification accuracy of about 81%\footnote{Using the Shogun SVM package (todo:cite), with parameters type=GMNPSVM (multiclass b-SVM), kernal = gaussian with sigma = 0.1, c = 10, epsilon = 1e-1 -- these are the first parameters we tried, so presumably performance would improve with different choices of parameters. 5-fold cross-validation.}. As noted above, however, a classifier that looks at all the genes at once isn't practically useful. 
   7.240 +
   7.241 +The requirement to find combinations of only a small number of genes limits us from straightforwardly applying many of the most simple techniques from the field of supervised machine learning. In the parlance of machine learning, our task combines feature selection with supervised learning.
   7.242 +
   7.243 +
   7.244 +==== Decision trees ====
   7.245 +todo
   7.246 +
   7.247 +
   7.248 +=== Aim 2 (and Aim 3) ===
   7.249 +
   7.250 +=== Raw dimensionality reduction results ===
   7.251 +
   7.252 +
   7.253 +=== Dimensionality reduction plus K-means or spectral clustering ===
   7.254 +
   7.255 +
   7.256 +==== Many areas are captured by clusters of genes ====
   7.257 +todo
   7.258 +
   7.259 +
   7.260 +
   7.261 +
   7.262 +
   7.263 +
   7.264 +
   7.265 +
   7.266 +
   7.267 +
   7.268 +
   7.269 +todo
   7.270 +
   7.271 +== Research plan ==
   7.272 +
   7.273 +todo
   7.274 +
   7.275 +amongst other thigns:
   7.276  
   7.277  ==== Develop algorithms that find genetic markers for anatomical regions ====
   7.278  # Develop scoring measures for evaluating how good individual genes are at marking areas: we will compare pointwise, geometric, and information-theoretic measures.
   7.279 @@ -275,316 +227,15 @@
   7.280  
   7.281  
   7.282  
   7.283 -
   7.284 -
   7.285 -
   7.286 -
   7.287 -gradient similarity is calculated as:
   7.288 -\sum_pixels cos(abs(\angle \nabla_1 - \angle \nabla_2)) \cdot \frac{\vert \nabla_1 \vert + \vert \nabla_2 \vert}{2}  \cdot \frac{pixel\_value_1 + pixel\_value_2}{2}
   7.289 -
   7.290 -
   7.291 -
   7.292 -
   7.293 -
   7.294 -
   7.295 -
   7.296 -(todo) Technically, we say that an anatomical structure has a fundamentally 2-D organization when there exists a commonly used, generic, anatomical structure-preserving map from 3-D space to a 2-D manifold.  
   7.297 -
   7.298 -
   7.299 -Related work:
   7.300 -
   7.301 -
   7.302 -The Allen Brain Institute has developed an interactive web interface called AGEA which allows an investigator to (1) calculate lists of genes which are selectively overexpressed in certain anatomical regions (ABA calls this the "Gene Finder" function) (2) to visualize the correlation between the genetic profiles of voxels in the dataset, and (3) to visualize a hierarchial clustering of voxels in the dataset \cite{ng_anatomic_2009}. AGEA is an impressive and useful tool, however, it does not solve the same problems that we propose to solve with this project.
   7.303 -
   7.304 -First we describe AGEA's "Gene Finder", and then compare it to our proposed method for finding marker genes. AGEA's Gene Finder first asks the investigator to select a single "seed voxel" of interest. It then uses a clustering method, combined with built-in knowledge of major anatomical structures, to select two sets of voxels; an "ROI" and a "comparator region"*. The seed voxel is always contained within the ROI, and the ROI is always contained within the comparator region. The comparator region is similar but not identical to the set of voxels making up the major anatomical region containing the ROI. Gene Finder then looks for genes which can distinguish the ROI from the comparator region. Specifically, it finds genes for which the ratio (expression energy in the ROI) / (expression energy in the comparator region) is high. 
   7.305 -
   7.306 -Informally, the Gene Finder first infers an ROI based on clustering the seed voxel with other voxels. Then, the Gene Finder finds genes which overexpress in the ROI as compared to other voxels in the major anatomical region. 
   7.307 -
   7.308 -There are three major differences between our approach and Gene Finder. 
   7.309 -
   7.310 -First, Gene Finder focuses on individual genes and individual ROIs in isolation. This is great for regions which can be picked out from all other regions by a single gene, but not all of them can (todo). There are at least two ways this can miss out on useful genes. First, a gene might express in part of a region, but not throughout the whole region, but there may be another gene which expresses in the rest of the region*. Second, a gene might express in a region, but not in any of its neighbors, but it might express also in other non-neighboring regions. To take advantage of these types of genes, we propose to find combinations of genes which, together, can identify the boundaries of all subregions within the containing region.
   7.311 -
   7.312 -Second, Gene Finder uses a pointwise metric, namely expression energy ratio, to decide whether a gene is good for picking out a region. We have found better results by using metrics which take into account not just single voxels, but also the local geometry of neighboring voxels, such as the local gradient (todo). In addition, we have found that often the absence of gene expression can be used as a marker, which will not be caught by Gene Finder's expression energy ratio (todo).
   7.313 -
   7.314 -Third, Gene Finder chooses the ROI based only on the seed voxel. This often does not permit the user to query the ROI that they are interested in. For example, in all of our tests of Gene Finder in cortex, the ROIs chosen tend to be cortical layers, rather than cortical areas.
   7.315 -
   7.316 -In summary, when Gene Finder picks the ROI that you want, and when this ROI can be easily picked out from neighboring regions by single genes which selectively overexpress in the ROI compared to the entire major anatomical region, Gene Finder will work. However, Gene Finder will not pick cortical areas as ROIs, and even if it could, many cortical areas cannot be uniquely picked out by the overexpression of any single gene. By contrast, we will target cortical areas, we will explore a variety of metrics which can complement the shortcomings of expression energy ratio, and we will use the combinatorial expression of genes to pick out cortical areas even when no individual gene will do.
   7.317 -
   7.318 -
   7.319 -* The terms "ROI" and "comparator region" are our own; the ABI calls them the "local region" and the "larger anatomical context". The ABI uses the term "specificity comparator" to mean the major anatomic region containing the ROI, which is not exactly identical to the comparator region.
   7.320 -
   7.321 -** In this case, the union of the area of expression of the two genes would suffice; one could also imagine that there could be situations in which the intersection of multiple genes would be needed, or a combination of unions and intersections.
   7.322 -
   7.323 -
   7.324 -Now we describe AGEA's hierarchial clustering, and compare it to our proposal. The goal of AGEA's hierarchial clustering is to generate a binary tree of clusters, where a cluster is a collection of voxels. AGEA begins by computing the Pearson correlation between each pair of voxels. They then employ a recursive divisive (top-down) hierarchial clustering procedure on the voxels, which means that they start with all of the voxels, and then they divide them into clusters, and then within each cluster, they divide that cluster into smaller clusters, etc***. At each step, the collection of voxels is partitioned into two smaller clusters in a way that maximizes the following quantity: average correlation between all possible pairs of voxels containing one voxel from each cluster. 
   7.325 -
   7.326 -There are three major differences between our approach and AGEA's hierarchial clustering. First, AGEA's clustering method separates cortical layers before it separates cortical areas. 
   7.327 -
   7.328 -
   7.329 -
   7.330 -
   7.331 -
   7.332 -following procedure is used for the purpose of dividing a collection of voxels into smaller clusters: partition the voxels into two sets, such that the following quantity is maximized: 
   7.333 -
   7.334 -*** depending on which level of the tree is being created, the voxels are subsampled in order to save time
   7.335 -
   7.336 -
   7.337 -
   7.338 -
   7.339 -
   7.340 -does not allow the user to input anything other than a seed voxel; this means that for each seed voxel, there is only one 
   7.341 -
   7.342 -
   7.343 -
   7.344 -The role of the "local region" is to serve as a region of interest for which marker genes are desired; the role of the "larger anatomical context" is to be the structure 
   7.345 -
   7.346 -
   7.347 -
   7.348 -There are two kinds of differences between AGEA and our project; differences that relate to the treatment of the cortex, and differences in the type of generalizable methods being developed. As relates 
   7.349 -
   7.350 -
   7.351 -indicate an ROI  
   7.352 -
   7.353 -explore simple correlation-based relationships between voxels, genes, and clusters of voxels. 
   7.354 -
   7.355 -
   7.356 -There have not yet been any studies which describe the results of applying AGEA to the cerebral cortex; however, we suspect that the AGEA metrics are not optimal for the task of relating genes to cortical areas. A voxel's gene expression profile depends upon both its cortical area and its cortical layer, however, AGEA has no mechanism to distinguish these two. As a result, voxels in the same layer but different areas are often clustered together by AGEA. As part of the project, we will compare the performance of our techniques against AGEA's.
   7.357 -
   7.358 ----
   7.359 -
   7.360 -The Allen Brain Institute has developed interactive tools called AGEA which allow an investigator to explore simple correlation-based relationships between voxels, genes, and clusters of voxels. There have not yet been any studies which describe the results of applying AGEA to the cerebral cortex; however, we suspect that the AGEA metrics are not optimal for the task of relating genes to cortical areas. A voxel's gene expression profile depends upon both its cortical area and its cortical layer, however, AGEA has no mechanism to distinguish these two. As a result, voxels in the same layer but different areas are often clustered together by AGEA. As part of the project, we will compare the performance of our techniques against AGEA's.
   7.361 -
   7.362 -Another difference between our techniques and AGEA's is that AGEA allows the user to enter only a voxel location, and then to either explore the rest of the brain's relationship to that particular voxel, or explore a partitioning of the brain based on pairwise voxel correlation. If the user is interested not in a single voxel, but rather an entire anatomical structure, AGEA will only succeed to the extent that the selected voxel is a typical representative of the structure. As discussed in the previous paragraph, this poses problems for structures like cortical areas, which (because of their division into cortical layers) do not have a single "typical representative".
   7.363 -
   7.364 -By contrast, in our system, the user will start by selecting, not a single voxel, but rather, an anatomical superstructure to be divided into pieces (for example, the cerebral cortex). We expect that our methods will take into account not just pairwise statistics between voxels, but also large-scale geometric features (for example, the rapidity of change in gene expression as regional boundaries are crossed) which optimize the discriminability of regions within the selected superstructure.
   7.365 -
   7.366 -
   7.367 ------
   7.368 -
   7.369 -screen for combinations of marker genes which selectively target anatomical structures
   7.370 -pick delineate the boundaries between neighboring anatomical structures.
   7.371 -(b) techniques to screen for marker genes which pick out anatomical structures of interest
   7.372 -
   7.373 -, techniques which: (a) screen for marker genes , and (b) suggest new anatomical maps based on 
   7.374 -
   7.375 -
   7.376 -whose expression partitions the region of interest into its anatomical substructures, and (b) use the natural contours of gene expression to suggest new ways of dividing an organ into 
   7.377 -
   7.378 -
   7.379 -The Allen Brain Atlas
   7.380 -
   7.381 -
   7.382 -
   7.383 -
   7.384 ---
   7.385 -
   7.386 -to: brooksl@mail.nih.gov
   7.387 -
   7.388 -Hi, I'm writing to confirm the applicability of a potential research
   7.389 -project to the challenge grant topic "New computational and
   7.390 -statistical methods for the analysis of large
   7.391 -data sets from next-generation sequencing technologies".
   7.392 -
   7.393 -We want to develop methods for the analysis of gene expression
   7.394 -datasets that can be used to uncover the relationships between gene
   7.395 -expression and anatomical regions. Specifically, we want to develop
   7.396 -techniques to (a) given a set of known anatomical areas, identify
   7.397 -genetic markers for each of these areas, and (b) given an anatomical structure
   7.398 -whose substructure is unknown, suggest a map, that is, a division of
   7.399 -the space into anatomical sub-structures, that represents the
   7.400 -boundaries inherent in the gene expression data.
   7.401 -
   7.402 -We propose to develop our techniques on the Allen Brain
   7.403 -Atlas mouse brain gene expression dataset by finding genetic markers
   7.404 -for anatomical areas within the cerebral cortex. The Allen Brain Atlas
   7.405 -contains a registered 3-D map of gene expression data with 200-micron
   7.406 -voxel resolution which was created from in situ hybridization
   7.407 -data. The dataset contains about 4000 genes which are available at
   7.408 -this resolution across the entire cerebral cortex.
   7.409 -
   7.410 -Despite the distinct roles of different cortical
   7.411 -areas in both normal functioning and disease processes, there are no
   7.412 -known marker genes for many cortical areas. This project will be
   7.413 -immediately useful for both drug discovery and clinical research
   7.414 -because once the markers are known, interventions can be designed
   7.415 -which selectively target specific cortical areas.
   7.416 -
   7.417 -This techniques we develop will be useful because they will be
   7.418 -applicable to the analysis of other anatomical areas, both in
   7.419 -terms of finding marker genes for known areas, and in terms of
   7.420 -suggesting new anatomical subdivisions that are based upon the gene
   7.421 -expression data.
   7.422 -
   7.423 -
   7.424 -
   7.425  ----
   7.426  
   7.427 -
   7.428 -
   7.429 -
   7.430 -
   7.431 -
   7.432 -It is likely that our study, by showing which areal divisions naturally follow from gene expression data, as opposed to traditional histological data, will contribute to the creation of
   7.433 -
   7.434 -there are clear genetic or chemical markers known for only a few cortical areas. This makes it difficult to target drugs to specific
   7.435 -
   7.436 -As part of aims (1) and (5), we will discover sets of marker genes that pick out specific cortical areas. This will allow the development of drugs and other interventions that selectively target individual cortical areas. As part of aims (2) and (5), we will also discover small panels of marker genes that can be used to delineate most of the cortical areal map. 
   7.437 -
   7.438 -
   7.439 -
   7.440 -With aims (2) and (4), we
   7.441 -
   7.442 -There are five principals
   7.443 -
   7.444 -
   7.445 -
   7.446 -In addition to validating the usefulness of the algorithms, the application of these methods to cerebral cortex will produce immediate benefits that are only one step removed from clinical application. 
   7.447 -
   7.448 -
   7.449 -todo: remember to check gensat, etc for validation (mention bias/variance)
   7.450 -
   7.451 -
   7.452 -
   7.453 -=== Why it is useful to apply these methods to cortex ===
   7.454 -
   7.455 -
   7.456 -There is still room for debate as to exactly how the cortex should be parcellated into areas. 
   7.457 -
   7.458 -
   7.459 -The best way to divide up rodent cortex into areas has not been completely determined, 
   7.460 -
   7.461 -
   7.462 -not yet been accounted for in 
   7.463 -
   7.464 - that the expression of some genes will contain novel spatial patterns which are not account
   7.465 -
   7.466 - that a genoarchitectonic map
   7.467 -
   7.468 -
   7.469 -This principle is only applicable to aim 1 (marker genes). For aim 2 (partition a structure in into anatomical subregions), we plan to work with many genes at once. 
   7.470 -
   7.471 -
   7.472 -tood: aim 2 b+s?
   7.473 -
   7.474 -
   7.475 -
   7.476 -
   7.477 -==== Principle 5: Interoperate with existing tools ====
   7.478 -
   7.479 -In order for our software to be as useful as possible for our users, it will be able to import and export data to standard formats so that users can use our software in tandem with other software tools created by other teams. We will support the following formats: NIFTI (Neuroimaging Informatics Technology Initiative), SEV (Allen Brain Institute Smoothed Energy Volume), and MATLAB. This ensures that our users will not have to exclusively rely on our tools when analyzing data. For example, users will be able to use the data visualization and analysis capabilities of MATLAB and Caret alongside our software.
   7.480 -
   7.481 -To our knowledge, there is no currently available software to convert between these formats, so we will also provide a format conversion tool. This may be useful even for groups that don't use any of our other software.
   7.482 -
   7.483 -
   7.484 -
   7.485 -todo: is "marker gene" even a phrase that we should use at all?
   7.486 -
   7.487 -
   7.488 -
   7.489 -note for aim 1 apps: combo of genes is for voxel, not within any single cell
   7.490 -
   7.491 -
   7.492 -
   7.493 -
   7.494 -, as when genetic markers allow the development of selective interventions; the reason that one can be confident that the intervention is selective is that it is only turned on when a certain combination of genes is turned on and off. The result procedure is what assures us that when that combination is present, the local tissue is probably part of a certain subregion. 
   7.495 -
   7.496 -
   7.497 -
   7.498 -The basic idea is that we want to find a procedure by 
   7.499 -
   7.500 -The task of finding genes that mark anatomical areas can be phrased in terms of what the field of machine learning calls a "supervised learning" task. The goal of this task is to learn a function (the "classifier") which
   7.501 -
   7.502 -If a person knows a combination of genes that mark an area, that implies that the person can be told how strong those genes express in any voxel, and the person can use this information to determine how 
   7.503 -
   7.504 -finding how to infer the areal identity of a voxel if given the gene expression profile of that voxel.
   7.505 -
   7.506 - 
   7.507 -For each voxel in the cortex, we want to start with data about the gene expression 
   7.508 -
   7.509 -
   7.510 -
   7.511 -There are various ways to look for marker genes. We will define some terms, and along the way we will describe a few design choices encountered in the process of creating a marker gene finding method, and then we will present four principles that describe which options we have chosen.
   7.512 -
   7.513 -In developing a procedure for finding marker genes, we are developing a procedure that takes a dataset of experimental observations and produces a result. One can think of the result as merely a list of genes, but really the result is an understanding of a predictive relationship between, on the one hand, the expression levels of genes, and, on the other hand, anatomical subregions.
   7.514 -
   7.515 -One way to more formally define this understanding is to look at it as a procedure. In this view, the result of the learning procedure is itself a procedure. The result procedure provides a way to use the gene expression profiles of voxels in a tissue sample in order to determine where the subregions are.
   7.516 -
   7.517 -This result procedure can be used directly, as when an experimenter has a tissue sample and needs to know what subregions are present in it, and, if multiple subregions are present, where they each are. Or it can be used indirectly; imagine that the result procedure tells us that whenever a certain combination of genes are expressed, the local tissue is probably part of a certain subregion. This means that we can then confidentally develop an intervention which is triggered only when that combination of genes are expressed; and to the extent that the result procedure is reliable, we know that the intervention will only be triggered in the target subregion.
   7.518 -
   7.519 -We said that the result procedure provides "a way to use the gene expression profiles of voxels in a tissue sample" in order to "determine where the subregions are". 
   7.520 -
   7.521 -
   7.522 -Does the result procedure get as input all of the gene expression profiles of each voxel in the entire tissue sample, and produce as output all of the subregional boundaries all at once? 
   7.523 -
   7.524 -
   7.525 -
   7.526 -
   7.527 -
   7.528 -
   7.529 - it is helpful for the classifier to look at the global "shape" of gene expression patterns over the whole structure, rather than just nearby voxels.
   7.530 -
   7.531 -
   7.532 -
   7.533 -
   7.534 -there is some small bit of additional information that can be gleaned from knowing the 
   7.535 -
   7.536 -==== Design choices for a supervised learning procedure ====
   7.537 -
   7.538 -
   7.539 -After all, 
   7.540 -
   7.541 -there is a small correlation between the gene expression levels from distant voxels and
   7.542 -
   7.543 -Depending on how we intend to use the classifier, we may want to design it so that
   7.544 -
   7.545 -It is possible for many things to 
   7.546 -
   7.547 -The choice of which data is made part of an instance 
   7.548 -
   7.549 -what we seek is a procedure 
   7.550 -
   7.551 - partition the tissue sample into subregions. 
   7.552 -
   7.553 - each part of the anatomical structure 
   7.554 -
   7.555 - must be One way to rephrase this task is to say that, instead of searching for the location of the subregions, we are looking to partition the tissue sample into subregions. 
   7.556 -
   7.557 -
   7.558 -There are various ways to look for marker genes. We will define some terms, and along the way we will describe a few design choices encountered in the process of creating a marker gene finding method, and then we will present four principles that describe which options we have chosen.
   7.559 -
   7.560 -In developing a procedure for finding marker genes, we are developing a procedure that takes a dataset of experimental observations and produces a result. One can think of the result as merely a list of genes, but really the result is an understanding of a predictive relationship between, on the one hand, the expression levels of genes, and, on the other hand, anatomical subregions.
   7.561 -
   7.562 -One way to more formally define this understanding is to look at it as a procedure. In this view, the result of the learning procedure is itself a procedure. The result procedure provides a way to use the gene expression profiles of voxels in a tissue sample in order to determine where the subregions are.
   7.563 -
   7.564 -This result procedure can be used directly, as when an experimenter has a tissue sample and needs to know what subregions are present in it, and, if multiple subregions are present, where they each are. Or it can be used indirectly; imagine that the result procedure tells us that whenever a certain combination of genes are expressed, the local tissue is probably part of a certain subregion. This means that we can then confidentally develop an intervention which is triggered only when that combination of genes are expressed; and to the extent that the result procedure is reliable, we know that the intervention will only be triggered in the target subregion.
   7.565 -
   7.566 -We said that the result procedure provides "a way to use the gene expression profiles of voxels in a tissue sample" in order to "determine where the subregions are". 
   7.567 -
   7.568 -
   7.569 -Does the result procedure get as input all of the gene expression profiles of each voxel in the entire tissue sample, and produce as output all of the subregional boundaries all at once? 
   7.570 -
   7.571 -
   7.572 -Or are we given one voxel at a time, 
   7.573 -
   7.574 -
   7.575 -In the jargon of the field of machine learning, the result procedure is called a __classifier__.
   7.576 -
   7.577 -
   7.578 -The task of finding genes that mark anatomical areas can be phrased in terms of what the field of machine learning calls a "supervised learning" task. The goal of this task is to learn a function (the "classifier") which
   7.579 -
   7.580 -If a person knows a combination of genes that mark an area, that implies that the person can be told how strong those genes express in any voxel, and the person can use this information to determine how 
   7.581 -
   7.582 -finding how to infer the areal identity of a voxel if given the gene expression profile of that voxel.
   7.583 -
   7.584 - 
   7.585 -For each voxel in the cortex, we want to start with data about the gene expression 
   7.586 -
   7.587 -
   7.588 -
   7.589 -single voxels, but rather groups of voxels, such that the groups can be placed in some 2-D space. We will call such instances "pixels".
   7.590 -
   7.591 -We have been speaking as if instances necessarily correspond to single voxels. But it is possible for instances to be groupings of many voxels, in which case each grouping must be assigned the same label (that is, each voxel grouping must stay inside a single anatomical subregion).
   7.592 -
   7.593 -
   7.594 -
   7.595 -In some but not all cases, the groups are either rows or columns of voxels. This is the case with the cerebral cortex, in which one may assume that columns of voxels which run perpendicular to the cortical surface all share the same areal identity. In the cortex, we call such an instance a "surface pixel", because such an instance represents the data associated with all voxels underneath a specific patch of the cortical surface. 
   7.596 +stuff i dunno where to put yet (there is more scattered through grant-oldtext):
   7.597 +
   7.598 +==== Principle 4: Work in 2-D whenever possible ====
   7.599 +
   7.600 +In anatomy, the manifold of interest is usually either defined by a combination of two relevant anatomical axes (todo), or by the surface of the structure (as is the case with the cortex). In the former case, the manifold of interest is a plane, but in the latter case it is curved. If the manifold is curved, there are various methods for mapping the manifold into a plane.
   7.601 +
   7.602 +The method that we will develop will begin by mapping the data into a 2-D plane. Although the manifold that characterized cortical areas is known to be the cortical surface, it remains to be seen which method of mapping the manifold into a plane is optimal for this application. We will compare mappings which attempt to preserve size (such as the one used by Caret\ref{van_essen_integrated_2001}) with mappings which preserve angle (conformal maps). 
   7.603 +
   7.604 +Although there is much 2-D organization in anatomy, there are also structures whose shape is fundamentally 3-dimensional. If possible, we would like the method we develop to include a statistical test that warns the user if the assumption of 2-D structure seems to be wrong.
   7.605 +
