@misc{schapire-01,
  howpublished = {MSRI Workshop on Nonlinear Estimation and
                  Classification},
  year =	 2002,
  title =	 {The Boosting Approach to Machine Learning: An
                  Overview},
  author =	 {Robert Schapire},
  abstract =	 {Boosting is ageneral method for improving the
                  accuracy of any given learning algorithm. Focusing
                  primarily on the AdaBoost algorithm, this chapter
                  overviews some of the recent work on boosting
                  including analyses of AdaBoost's training error and
                  generalization error; boosting's connection to
                  gametheory and linear programming; the relationship
                  between boosting and logistic regression; extensions
                  of AdaBoost for multiclassclassification problems;
                  methods of incorporating human knowledge into
                  boosting; and experimental and applied work using
                  boosting. },
  entered_on =	 {03/17/04},
  pdf =		 {algorithms/schapire-01.pdf},
  comments =	 {A good review of AdaBoost.},
}


@article{tibshirani-96,
  title =	 {Regression Shrinkage and selection via the {L}ASSO},
  author =	 {Robert Tibshirani},
  journal =	 {J.\ Roy.\ Stat.\ Soc., Ser.\ B (Method.)},
  volume =	 {58},
  number =	 1,
  year =	 1996,
  pages =	 {267-288},
  entered_on =	 {02/15/04},
  pdf =		 {algorithms/tibshirani-96.pdf},
  abstract =	 {We propose a new method for estimation in linear
                  models. The 'lasso' minimizes the residual sum of
                  squares subject to the sum of the absolute value of
                  the coefficients being less than a constant. Because
                  of the nature of this constraints it tends to
                  produce some coefficients that are exactly 0 and
                  hence gives interpretable models. Our simulation
                  studies suggest tha the lasso enjoys some of the
                  favourable properties of both subset selction and
                  ridge regression. It produces interpretable models
                  like subset selection and exhibits the stability of
                  ridge regression. There is also an interesting
                  relationship with the recent work in adaptive
                  function estimation by Donoho and Johnstone. The
                  lasso idea is quite general and can be applied in a
                  variety of statistical models: extensions to
                  generalized regression models and tree-based models
                  are briefly described.},
  comments =	 {A conceptually clear and well--written paper. The
                  usual idea that one has (a-la Bayesian model
                  selection, or James--Stein shrinkage, or Donoho and
                  Johnstone, 1994) is to shrink some coefficents in,
                  say, simple linear models. This is a principled way
                  to proceed, since "shrinkage" of the coefficients
                  results from averaging over the Bayesian
                  prior. Instead, in lasso one starts we requiring
                  that sum of the absolute values of coefficients in
                  linear models is minimal (this is, by the way, very
                  similar to the standard SVM toolbox). In this case
                  many coeffiecients that would have usually be
                  "small" are shrunk exactly to zero. The minimization
                  problem for lasso has, therefore, the usual two-term
                  structure: goodness of fit (usual quadratic error),
                  and the complexity penalty (the sum of absolute
                  values of coefficients; other penalties also
                  considered as examples). Errors of the lasso
                  estimates are to be estimated by bootstrap. I think
                  that the absval constraint on parameters is not
                  strong enough to be a good "complexity penalty" in
                  an arbitrary case, when the number of predictor
                  variables is huge (possibly infinite). Fig. 1 in
                  this paper gives a good insight: for very large
                  dimensions, the probability that the ellipses of
                  maximum likelihood will hit the constraints
                  hypercube at points where many coefficients are zero
                  (verticies, ridges) becomes very small. Lasso will
                  eliminate many coefficients for
                  infinite--dimensional models, but it won't eliminate
                  most of them. This is in contrast to Bayesian case
                  \cite{nemenman-04}. It might me interesting to do
                  infinite dimensional analysis of lasso to analyze
                  this question. Note also that in this model one must
                  select the comparative weight of the goodness of fit
                  and the complexity term ("t" in the article) by some
                  other mean (e.g., bootstrap): lasso is not a
                  self-suffiecient machine.},
}

@article{efron-etal-03,
  title =	 {Least Angle Regression},
  journal =	 {Ann.\ Stat.},
  volume =	 31,
  year =	 2003,
  entered_on =	 {02/15/04},
  author =	 {Bradley Efron and Trevor Hastie and Iain Johnstone
                  and Robert Tibshirani},
  abstract =	 {The purpose of model selection algorithms such as
                  All Subsets, Forward Selection, and Backward
                  Elimination is to choose a linear model on the basis
                  of the same set of data to which the model will be
                  applied. Typically we have available a large
                  collection of possible covariates from which we hope
                  to select a parsimonious set for the efficient
                  prediction of a response variable. Least Angle
                  Regression ("LARS"), a new model selection
                  algorithm, is a useful and less greedy version of
                  traditional forward selection methods. Three main
                  properties are derived. (1)A simple modification of
                  the LARS algorithm implements the Lasso, an
                  attractive version of Ordinary Least Squares that
                  constrains the sum of the absolute regression
                  coefficients; the LARS modification calculates all
                  possible Lasso estimates for a given problem, using
                  an order of magnitude less computer time than
                  previous methods. (2)A different LARS modification
                  efficiently implements Forward Stagewise linear
                  regression, another promising new model selection
                  method; this connection explains the similar
                  numerical results previously observed for the Lasso
                  and Stagewise, and helps understand the properties
                  of both methods, which are seen as constrained
                  versions of the simpler LARS algorithm. (3) A simple
                  approximation for the degrees of freedom of a LARS
                  estimate is available, from which we derive a Cp
                  estimate of prediction error; this allows a
                  principled choice among the range of possible LARS
                  estimates. LARS and its variants are computationally
                  efficient: the paper describes a publicly available
                  algorithm that requires only the same order of
                  magnitude of computational efort as Ordinary Least
                  Squares applied to the full set of covariates.},
  pdf =		 {algorithms/efron-etal-03.pdf},
  comments =	 {The LARS algorithm is an "algorithm" -- a sequence
                  of steps than one has to undertake. It is not clear
                  which problem is being solved, which lagrangian is
                  being minimized. Thus it is very difficult (for me)
                  to read this paper. Few points (1) the algoithm has
                  connections to lasso, boosting, and forward
                  stagewise regression (See Fig. 5). The performance
                  of these methods is undistinuishable from each other
                  within statistical errors. So, basically, these are
                  all agorithm solving almost the same problem in
                  various algorithmic ways; Lasso has the best
                  explanation of what is really being done -- it has a
                  Lagrangian which is being minimized. Also looking at
                  Fig. 5 we see that (2) neither of these methods
                  (LARS, Lasso, stagewise) is capable of doing proper
                  model capacity control -- once the number of
                  parameters goes above the best, the models start to
                  have large prediction errors. So, together with the
                  algortithms, one still must use some external method
                  (bootstrap? crossvalidation?) for capacity
                  control. (3) Further, we see that, within
                  statistical error, the best possible performance of
                  a simple, unconstrained, regression is as good as of
                  all of these new methods; the best performance for
                  the OLS is actually reached with a lot smaller
                  number of predictors! This performance decays very
                  repidly as more terms are introduced, so one must
                  have a good capacity control mechanism build in --
                  but this is required for LARS and Co. anyway. Then
                  the question is: if, at their best, LARS and
                  derivatives are comparable to OLS, and all require
                  capacity control tools, why shouldn't we go with the
                  easier OLS? In any case, it has a great bayesian
                  model selection going for it!}
}

@article{dempster-etal-77,
  author =	 {A.~P.~Dempster and N.~M.~Laird and D.~B.~Rubin},
  journal =	 {J.\ Roy.\ Stat.\ Soc., Ser.\ B (Methodol.)},
  title =	 {Maximum likelihood from incomplete data via the {EM}
                  algorithm},
  volume =	 39,
  number =	 1,
  year =	 1977,
  pages =	 {1--38},
  entered_on =	 {04/08/03},
  url =		 {algorithms/dempster-laird-rubin-77.djv},
  abstract =	 {A broadly applicble algoritm for computing maximum
                  likelihood estimates from incomplete data is
                  presented at various levels of generality. Theory
                  showing the monotone beaviour of the likelihood and
                  the convergence of the algorithm is derived. Many
                  examples are sketched, including missing value
                  situations, applications to grouped, censored or
                  truncated data, finite mixture models, variance
                  component estimation, hyperparameter estimation,
                  iteratively reweighted least squares and factor
                  analysis.},
  comments =	 {This paper introduces EM algorithm. One of the most
                  notable examples it gives is about hyperparameter
                  estimation, which is roughly equivalent to Bayesian
                  model selection in today's language.},
}


@Article{hasselblad-69,
  author =	 {Victor Hasselblad},
  title =	 {Estimation of finite mixtures of distributions from
                  the exponential family},
  journal =	 {J.\ Amer.\ Stat.\ Assoc.},
  year =	 {1969},
  volume =	 {64},
  number =	 328,
  month =	 Dec,
  pages =	 {1459--1471},
  url =		 {algorithms/hasselblad-69.djv},
  entered_on =	 {04/09/03},
  abstract =	 {General ''successive substitutions'' iteration
                  equations are developed for obtaining estimates for
                  finite mixtures of ditributions from the xponential
                  family. These, in general, correspond to relative
                  maximums of the likelihood function. It is assumed
                  that the number of distributions is known, and that
                  the mixtures are from distributions of the same
                  type, but with different parameter values. The
                  particular equations for the Poisson, binomial, and
                  exponential distributions are given, as well as
                  examples of the results of the procedure for each
                  distribution. From the examples tried, it was
                  observed that the likelihood function increased at
                  each iteration. Graphs of the asymptotic variances
                  of the estimates are givenm and two sampling
                  experiments comparing estimates obtained by this
                  scheme with moment estimates are also given.},
  comments =	 {The method discussed in this paper is a precursor of
                  the EM algorithm, \cite{dempster-etal-77}. EM is
                  used to estimate the mixture parameters in this
                  paper.},
}

@article{kurdistani-etal-02,
  title =	 {Genome-wide binding map of the histone deacetylase
                  Rpd3 in yeast},
  author =	 {S Kurdistani and D Robyr and S Tavazoie and M
                  Grunstein},
  abstract =	 {We describe the genome-wide distribution of the
                  histone deacetylase and repressor Rpd3 and its
                  associated proteins Ume1 and Ume6 in Saccharomyces
                  cerevisiae. Using a new cross-linking protocol, we
                  found that Rpd3 binds upstream of many individual
                  genes and upstream of members of gene classes with
                  similar functions in anabolic processes. In
                  addition, Rpd3 is preferentially associated with
                  promoters that direct high transcriptional
                  activity. We also found that Rpd3 was absent from
                  large sub-telomeric domains. We show by
                  co-immunoprecipitation and by the high similarity of
                  their binding maps that Ume1 interacts with Rpd3. In
                  contrast, despite the known role of Ume6 in Rpd3
                  recruitment, only a limited number of the genes
                  targeted by Rpd3 are also enriched for (or targeted
                  by) Ume6. This suggests that Rpd3 is brought to many
                  promoters by alternative recruiters, some of which
                  may bind the putative cis-regulatory DNA elements
                  that we have identified in sets of Rpd3 target
                  genes. Finally, we show that comparing the
                  genome-wide pattern of Rpd3 binding with gene
                  expression and histone acetylation in the
                  rpd3Ćmutant strain reveals new sites of Rpd3
                  function.},
  journal =	 {Nat. Genet.},
  volume =	 31,
  year =	 2002,
  pages =	 {248-254},
  pdf =		 {bioinformatics/kurdistani-etal-02.pdf},
  entered_on =	 {08/05/05},
  comments =	 {},
}


@article{ronen-etal-02,
  title =	 {Assigning numbers to the arrows: Parameterizing a
                  gene regulation network by using accurate expression
                  kinetics},
  author =	 {M Ronen and R Rosenberg and B Shraiman and U Alon},
  abstract =	 {A basic challenge in systems biology is to
                  understand the dynamical behavior of gene regulation
                  networks. Current approaches aim at determining the
                  network structure based on genomic-scale
                  data. However, the network connectivity alone is not
                  sufficient to define its dynamics; one needs to also
                  specify the kinetic parameters for the regulation
                  reactions. Here, we ask whether effective kinetic
                  parameters can be assigned to a transcriptional
                  network based on expression data. We present a
                  combined experimental and theoretical approach based
                  on accurate high temporal-resolution measurement of
                  promoter activities from living cells by using green
                  fluorescent protein (GFP) reporter plasmids. We
                  present algorithms that use these data to assign
                  effective kinetic parameters within a mathematical
                  model of the network. To demonstrate this, we employ
                  a well defined network, the SOS DNA repair system of
                  Escherichia coli. We find a strikingly detailed
                  temporal program of expression that correlates with
                  the functional role of the SOS genes and is driven
                  by a hierarchy of effective kinetic parameter
                  strengths for the various promoters. The calculated
                  parameters can be used to determine the kinetics of
                  all SOS genes given the expression profile of just
                  one representative, allowing a significant reduction
                  in complexity. The concentration profile of the
                  master SOS transcriptional repressor can be
                  calculated, demonstrating that relative protein
                  levels may be determined from purely transcriptional
                  data. This finding opens the possibility of
                  assigning kinetic parameters to transcriptional
                  networks on a genomic scale.},
  journal =	 {Proc. Natl. Acad. Sci. USA},
  year =	 2002,
  volume =	 99,
  number =	 16,
  pages =	 {10555-10560},
  entered_on =	 {08/04/05},
  comments =	 {The authors measure expression of RNA's of single
                  genes in the SOS module to high precision and then
                  fit the measurements to Michaelis-Menten kinetic for
                  both the expression and the involved protein
                  level. This way they find out all kinetic constants
                  (but, recall, they start from the known topology of
                  the network. Nothing extraordinary here.},
  pdf =		 {bioinformatics/ronen-etal-02.pdf},
}
@article{mcadams-arkin-98m,
  title =	 {simulation of prokaryotic gene circuits},
  author =	 {H McAdams and A Arkin},
  abstract =	 {Biochemical and genetic approaches have identified
                  the molecular mechanisms of many genetic reactions,
                  particularly in bacteria. Now a comparably detailed
                  understanding is needed of how groupings of genes
                  and related protein reactions interact to
                  orchestrate cellular functions over the cell cycle,
                  to implement preprogrammed cellular development, or
                  to dynamically change a cell's processes and
                  structures in response to environmental
                  signals. Simulations using realistic,
                  molecular-level models of genetic mechanisms and of
                  signal transduction networks are needed to analyze
                  dynamic behavior of multigene systems, to predict
                  behavior of mutant circuits, and to identify th
                  edesign principles applicable to design of genetic
                  regulatory circuits. When the underlying design
                  rules for regulatory circuits are understood, it
                  will be far easier to recognize common circuit
                  motifs, to identify functions of individual proteins
                  in regulation, and to redesign circuits for altered
                  functions. },
  journal =	 {Annu. Rev. Biophys. Biomol. Struct.},
  year =	 1998,
  volume =	 27,
  pages =	 {199-224},
  comments =	 {Good introductory review of things from
                  biochemistry, noise, to chemotaxis, lambda-phage
                  switch, cell cycle. Good literature collection circa
                  1988.},
  pdf =		 {bioinformatics/mcadams_arkin_98.pdf},
  entered_on =	 {08/01/05},
}

@article{yeung-02,
  title =	 {Reverse engineering of gene networks using singular
                  value decomposition and robust regression},
  author =	 {MKS Yeung and J Tegner and JJ Collins},
  abstract =	 {We propose a scheme to reverse-engineer gene
                  networks on a genome-wide scale using a relatively
                  small amount of gene expression data from microarray
                  experiments. Our method is based n the empirical
                  observation that such networks are typically large
                  and sparse. it uses singular value decomposition to
                  construct a family of candidate solutions and then
                  uses robust regression to identify the solution with
                  the smallest number of connections as the most
                  likely solution. Our algorithm has O(logN) sampling
                  complexty and O(N^4) computational complexity. we
                  test and validate our approach in a series of in
                  numero experiments on model gene networks.},
  journal =	 {Proc. Natl. Acad. Sci. USA},
  volume =	 99,
  year =	 2002,
  number =	 9,
  pages =	 {6163--6168},
  pdf =		 {bioinformatics/yeung-tegner-collins-02.pdf},
  entered_on =	 {08/02/05},
  comments =	 {See also \cite{alter-etal-00}. This is relevant to
                  \cite{wiggins-nemenman-03} -- same linearization of
                  the dynamics near the steady state. They use L1
                  regression (Lasso) as a complexity control
                  mechanism. This is questionable in higher dimensions
                  (see my comments on Lasso), since this complexity
                  control measure leads to a fraction of all possible
                  parameters to be nonzero, not a low power of the
                  number of the parameters. The key point of the paper
                  is the linearization of the dynamics near the steady
                  state after a small transient perturbation. Nicely,
                  one can reconstruct networks this way element by
                  element, focusing on a netwrok around a single gene
                  without analyzing the whole networks. In many
                  respects, ARACNE is the generalization of this
                  algorithms without the linearity assumption and with
                  stricter capacity control mechanisms.},
}

@article{li-97,
  author =	 {W Li},
  title =	 {The Complexity of {DNA}: {T}he measure of
                  compositional heterogeneity in DNA sequences and
                  measures of complexity.},
  journal =	 {Complexity},
  volume =	 3,
  number =	 2,
  pages =	 {33-37},
  entered_on =	 {08/01/05},
  pdf =		 {bioinformatics/li_97.pdf},
  comments =	 {Long range correlation in DNA. Potentially useful
                  for \cite{bnt-01} since defines some arbitrary
                  complexity measure.},
  year =	 1997,
} 

@article{li-etal-94,
  author =	 {W Li and T Marr and K Kaneko},
  year =	 1994,
  title =	 "Understanding long-range correlations in DNA
                  sequences",
  journal =	 {Physica D (Nonlinearity)},
  volume =	 75,
  pages =	 {392-416},
  pdf =		 {bioinformatics/li_marr_koneko_94.pdf},
  note =	 {erratum: Physica D, 82:217 (1995)},
  entered_on =	 {08/01/05},
  commentrs =	 {Long range correelations and 1/f spectra in genomic
                  data.},
  abstract =	 {In this paper, we review the literature on
                  statistical long-range correlation in DNA
                  sequences. We examine the current evidence for these
                  correlations, and conclude that a mixture of many
                  length scales (including some relatively long ones)
                  in DNA sequences is responsible for the observed
                  1/f-like spectral component. We note the complexity
                  of the correlation structure in DNA sequences. The
                  observed complexity often makes it hard, or
                  impossible, to decompose the sequence into a few
                  statistically stationary regions. We suggest that,
                  based on the complexity of DNA sequences, a fruitful
                  approach to understand long-range correlation is to
                  model duplication, and other rearrangement
                  processes, in DNA sequences. One model, called
                  "expansion-modification system", contains only point
                  duplication and point mutation. Though simplistic,
                  this model is able to generate sequences with 1/f
                  spectra. We emphasize the importance of DNA
                  duplication in its contribution to the observed
                  long-range correlation in DNA sequences. },
}
@article{alter-etal-00,
  title =	 {Singular value decomposition for genome-wide
                  expression data processing and modeling},
  author =	 {O Alter and PO Brown and D Botstein},
  abstract =	 {We describe the use of singular value decomposition
                  in transforming genome-wide expression data from
                  genes   arrays space to reduced diagonalized
                  ÔÔeigengenesŐŐ   ÔÔeigenarraysŐŐ space, where the
                  eigengenes (or eigenarrays) are unique orthonormal
                  superpositions of the genes (or arrays). Normalizing
                  the data by filtering out the eigengenes (and
                  eigenarrays) that are inferred to represent noise or
                  experimental artifacts enables meaningful comparison
                  of the expression of different genes across
                  different arrays in different experiments. Sorting
                  the data according to the eigengenes and eigenarrays
                  gives a global picture of the dynamics of gene
                  expression, in which individual genes and arrays
                  appear to be classified into groups of similar
                  regulation and function, or similar cellular state
                  and biological phenotype, respectively. After
                  normalization and sorting, the significant
                  eigengenes and eigenarrays can be associated with
                  observed genome-wide effects of regulators, or with
                  measured samples, in which these regulators are
                  overactive or underactive, respectively.},
  entered_on =	 {08/01/05},
  journal =	 {Proc. Natl. Acad. Sci. USA},
  year =	 2000,
  volume =	 97,
  number =	 18,
  pages =	 {10101-10106},
  pdf =		 {bioinformatics/alter-etal-00.pdf},
  comments =	 {One of the first microarray SVD papers; a few
                  datasets are analyzed, it's shown that few
                  components capture most of the variance.},
}


@article{nowicka-etal-01,
  note =	 {arXiv: cond-mat/0102348},
  title =	 {Long-Tail Feature of DNA Words Over- and
                  Under-Representation in Coding Sequences},
  author =	 {A Nowicka and M Dudek and S Cebrat and M Kowalczuk
                  and P Mackiewicz and M Dudkiewicz and D Szczepanik},
  journal =	 {Comp. Meth. Sci. Technol.},
  volume =	 6,
  pages =	 {65--71},
  year =	 2000,
  abstract =	 {We have analyzed DNA sequences of known genes from
                  16 yeast chromosomes (Saccharomyces cerevisiae) in
                  terms of oligonucleotides. We have noticed that the
                  relative abundances of oligonucleotide usage in the
                  genome follow a long-tail Levy-like distribution. We
                  have observed that long genes often use strongly
                  over-represented and under-represented nucleotides,
                  whereas it was not the case for the short genes
                  (shorter than 300 nucleotides) under
                  consideration. If selection on the extremely
                  over-represented/under-represented oligonucleotides
                  was strong, long genes would be more affected by
                  spontaneous mutations than short ones.},
  comments =	 {Good quick review of statistics of nucleotide
                  frequencies. Lefi flights, Zipf law in nucleotide
                  frequencies.},
  entered_on =	 {08/01/05},
  pdf =		 {bioinformatics/nowicka-etal-01.pdf},
}

@misc{chattopadhyay-etal-01,
  howpublished = {arXiv: physics/0102043},
  title =	 {Statistical Approach to Gene Evolution},
  abstract =	 {The evolution in coding DNA sequences brings new
                  flexibility and freedom to the codon words, even as
                  the underlying nucleotides get significantly
                  ordered. These curious contra-rules of gene
                  organisation are observed from the distribution of
                  words and the second moments of the nucleotide
                  letters. These statistical data give us the physics
                  behind the classification of bacteria.},
  author =	 {S Chattopadhyay and W Kanner and J Chakrabarti},
  entered_on =	 {08/01/05},
  comments =	 {Zipf law, etc., in codon distribution.}
}

@article{holter-etal-01,
  title =	 {Dynamic modeling of gene expression data},
  author =	 {N Holter and A Maritan  and M Cieplak and N Fedoroff
                  and J Banavar},
  abstract =	 {We describe the time evolution of gene expression
                  levels by using a time translational matrix to
                  predict future expression levels of genes based on
                  their expression levels at some initial time. We
                  deduce the time translational matrix for previously
                  published DNA microarray gene expression data sets
                  by modeling them within a linear framework by using
                  the characteristic modes obtained by singular value
                  decomposition. The resulting time translation matrix
                  provides a measure of the relationships among the
                  modes and governs their time evolution. We show that
                  a truncated matrix linking just a few modes is a
                  good approximation of the full time translation
                  matrix. This finding suggests that the number of
                  essential connections among the genes is small.},
  journal =	 {Proc. Natl. Acad. Sci},
  year =	 2001,
  volume =	 98,
  number =	 4,
  pages =	 {1693--1698},
  pdf =		 {bioinformatics/holter-etal-01.pdf},
  entered_on =	 {08/01/05},
  comments =	 {Pretty similar to \cite{wiggins-nemenman-03}. This
                  paper doesn't much deal with overfitting, and I
                  think they overfit.},
}


@article{brown-etal-99,
  title =	 {Knowledge-based analysis of microarray gene
                  expression data by using support vector machines},
  author =	 {M Brown and WN Grundy and D Lin and N Cristianini
                  and CW Sugnet T Furey and M Ares and D Haussler},
  abstract =	 {We introduce a method of functionally classifying
                  genes by using gene expression data from DNA
                  microarray hybridization experiments. The method is
                  based on the theory of support vector machines
                  (SVMs). SVMs are considered a supervised computer
                  learning method because they exploit prior knowledge
                  of gene function to identify unknown genes of
                  similar function from expression data. SVMs avoid
                  several problems associated with unsupervised
                  clustering methods, such as hierarchical clustering
                  and self-organizing maps. SVMs have many
                  mathematical features that make them attractive for
                  gene expression analysis, including their
                  flexibility in choosing a similarity function,
                  sparseness of solution when dealing with large data
                  sets, the ability to handle large feature spaces,
                  and the ability to identify outliers. We test
                  several SVMs that use different similarity metrics,
                  as well as some other supervised learning methods,
                  and find that the SVMs best identify sets of genes
                  with a common function using expression
                  data. Finally, we use SVMs to predict functional
                  roles for uncharacterized yeast ORFs based on their
                  expression data.},
  pages =	 {262-267},
  journal =	 {Proc. Natl. Acad. Sci. USA},
  year =	 2000,
  volume =	 97,
  number =	 1,
  entered_on =	 {08/01/05},
  pdf =		 {bioinformatics/brown-etal-00.pdf},
  comments =	 {Uses SVMs to classify genes by their expressions,
                  where the training set consists of genes of known
                  function (belonging to the same functional class).},
}

@misc{gorban-etal-01,
  howpublished = {arXiv: physics/0108016},
  title =	 {Self-organizing Approach for Automated Gene
                  Identification in Whole Genomes},
  author =	 {AN Gorban and A Zinovyev and T Popova},
  abstract =	 {An approach based on using the idea of distinguished
                  coding phase in explicit form for identification of
                  protein-coding regions (exons) in whole genome has
                  been proposed. For several genomes an optimal window
                  length for averaging GC-content function and
                  calculating codon frequencies has been
                  found. Self-training procedure based on clustering
                  in multidimensional space of triplet frequencies is
                  proposed. For visualization of data in the space of
                  triplet requiencies method of elastic maps was
                  applied.},
  entered_on =	 {08/01/05},
  pdf =		 {bioinformatics/gorban-etal-01.pdf},
  comments =	 {Use GC content and the similarity between one-base
                  shifted frequencies of bases to distinguish coding
                  regions. Isn't this not a problem any more?},
  url =		 {http://arxiv.org/abs/physics/0108016},
}

@article{friedman-etal-00,
  title =	 {Using Bayesian Networks to Analyze Expression Data},
  author =	 {N Friedman and M Linial and I Nachman and D Peer},
  abstract =	 {DNA hybridization arrays simultaneously measure the
                  expression level for thousands of genes. These
                  measurements provide a ŇsnapshotÓ of the cellŐs
                  transcriptions. A major challenge in computational
                  biology is to uncover, from such measurements,
                  gene/protein interactions and key biological
                  features of the cellular system. In this paper, we
                  propose a new framework for discovering interactions
                  between genes based on multiple expression
                  measurements. This framework builds on the use of
                  Bayesian networks for representing statistical
                  dependencies. A Bayesian network is a graphical
                  model of joint multivariate probability
                  distributions that captures properties of
                  conditional independence between variables. Such
                  models are attractive for their ability to describe
                  complex stochastic processes, and for providing
                  clear methodologies for learning from (noisy)
                  observations. We start by showing how Bayesian
                  networks can describe interactions between genes. We
                  then present an efficient algorithm capable of
                  learning such networks and a statistical method to
                  assess our confidence in their features. Finally, we
                  apply this method to the S. cerevisae cell-cycle
                  measurements of Spellman et al. (1998) to uncover
                  biological features. },
  pdf =		 {bioinformatics/friedman-etal-00.pdf},
  comments =	 {Standard Bayes Nets for expression data; one of the
                  first such papers.},
  journal =	 {J. Comput. Biol.},
  year =	 2000,
  volume =	 7,
  number =	 {3-4},
  pages =	 {601-620},
}

@inproceedings{nachman-etal-05,
  title =	 {Inferring Quantitative Models of Regulatory Networks
                  From Expression Data},
  author =	 {I Nachman and A Regev and N Friedman},
  abstract =	 {Motivation: Genetic networks regulate key processes
                  in living cells. Various methods have been suggested
                  to reconstruct network architecture from gene
                  expression data. However, most approaches are based
                  on qualitative models that provide only rough
                  approximations of the underlying events, and lack
                  the quantitative aspects that are critical for
                  understanding the proper function of biomolecular
                  systems. <p>Results: We present fine-grained
                  dynamical models of gene transcription and develop
                  methods for reconstructing them from gene expression
                  data within the framework of a generative
                  probabilistic model. Unlike previous works, we
                  employ quantitative transcription rates, and
                  simultaneously estimate both the kinetic parameters
                  that govern these rates, and the activity levels of
                  unobserved regulators that control them. We apply
                  our approach to expression data sets from yeast and
                  show that we can learn the unknown regulator
                  activity profiles, as well as the binding affinity
                  parameters. We also introduce a novel structure
                  learning algorithm, and demonstrate its power to
                  accurately reconstruct the regulatory network from
                  those data sets.},
  booktitle =	 {Proc. ISMB'04},
  year =	 2005,
  entered_on =	 {02/02/05},
  comments =	 {Models gene expression asproduction due to activity
                  of some regulators (activated TFs), and mRNA
                  decay. Algorithm fits for a small number of
                  regulators with some time profiles that can explain
                  the observed mRNA activity (with some noise, of
                  course). No way is suggested for identification of
                  the regulator with a gene that produces it. That is,
                  a reconstructed network has only regulator-gene
                  connections, and expression of genes is not used to
                  model expression of regulators (this should be
                  correctable).},
}

@article{zhou-etal-04,
  title =	 {A Bayesian connectivity-based approach to
                  constructing probabilistic gene regulatory networks},
  author =	 {X Zhou and X Wang and R Pal and I Ivanov and M
                  Bittner and E Dougherty},
  journal =	 {Bioinformatics},
  paged =	 {2918-2927},
  volume =	 20,
  number =	 17,
  year =	 2004,
  abstract =	 {Motivation: We have hypothesized that the
                  construction of transcriptional regulatory networks
                  using a method that optimizes connectivity would
                  lead to regulation consistent with biological
                  expectations. A key expectation is that the
                  hypothetical networks should produce a few, very
                  strong attractors, highly similar to the original
                  observations, mimicking biological state stability
                  and determinism. Another central expectation is
                  that, since it is expected that the biological
                  control is distributed and mutually reinforcing,
                  interpretation of the observations should lead to a
                  very small number of connection schemes. <p>Results:
                  We propose a fully Bayesian approach to constructing
                  probabilistic gene regulatory networks (PGRNs) that
                  emphasizes network topology. The method computes the
                  possible parent sets of each gene, the corresponding
                  predictors and the associated probabilities based on
                  a nonlinear perceptron model, using a reversible
                  jump Markov chain Monte Carlo (MCMC) technique, and
                  an MCMC method is employed to search the network
                  configurations to find those with the highest
                  Bayesian scores to construct the PGRN. The Bayesian
                  method has been used to construct a PGRN based on
                  the observed behavior of a set of genes whose
                  expression patterns vary across a set of melanoma
                  samples exhibiting two very different phenotypes
                  with respect to cell motility and invasiveness. Key
                  biological features have been faithfully reflected
                  in the model. Its steady-state distribution contains
                  attractors that are either identical or very similar
                  to the states observed in the data, and many of the
                  attractors are singletons, which mimics the
                  biological propensity to stably occupy a given
                  state. Most interestingly, the connectivity rules
                  for the most optimal generated networks constituting
                  the PGRN are remarkably similar, as would be
                  expected for a network operating on a distributed
                  basis, with strong interactions between the
                  components. <p>Availability: The appendix is
                  available at
                  http://gspsnap.tamu. edu/gspweb/pgrn/bayes.html. username:
                  gspweb password: gsplab.},
  pdf =		 {bioinformatics/zhou-etal-04.pdf},
  entered_on =	 {02/02/05},
  comments =	 {As far as I can tell, the regulation is modeled by
                  boolean networks, and measurements of steady states
                  of transcriptional networks are steady states of
                  these bollean ones. There is a probability of a
                  switch between boolean networks at every evolution
                  step, resulting in many possible steady states, and
                  also some noise around them. Each boolean network is
                  then learned, their number is learned, probability
                  of switches are learned, all in Bayesian formalism
                  with appropriate BIC-like complexity
                  control. Examples analyzed involve networks of only
                  10 (!) genes. It is seen that the connectivity
                  patterns of the constituent Boolean nets are not
                  very different from each other, and this is somehow
                  related to robustness of transcriptional control,
                  homeostasis, etc -- don't ask me how.},
}













@article{brown-callan-04,
  title =	 {Evolutionary comparisons suggest many novel cAMP
                  response protein binding sites in Escherichia coli},
  author =	 {C. T. Brown and C. G. Callan, Jr.},
  abstract =	 {The cAMP response protein (CRP) is a transcription
                  factor known to regulate many genes in Escherichia
                  coli. Computational studies of transcription factor
                  binding to DNA are usually based on a simple matrix
                  model of sequence-dependent binding energy. For CRP,
                  this model predicts many binding sites that are not
                  known to be functional. If they are indeed spurious,
                  the underlying binding model is called into
                  question. We use a species comparison method to
                  assess the functionality of a population of such
                  predicted CRP sites in E. coli. We compare them with
                  orthologous sites in Salmonella
                  typhimuriumidentified independently by CLUSTALW
                  alignment, and find a dependence of mutation
                  probability on position in the site. This dependence
                  increases with predicted site binding energy. The
                  positions where mutation is most strongly suppressed
                  are those where mutation would have the biggest
                  effect on predicted binding energy. This finding
                  suggests that many of the novel sites are
                  functional, that the matrix model correctly
                  estimates their binding strength, and that
                  calculated CRP binding strength is the quantity that
                  is conserved between species. The analysis also
                  identifies many new E. coli binding sites and genes
                  likely to be functional for CRP.},
  pdf =		 {bioinformatics/brown-callan-03.pdf},
  journal =	 {Proc. Natl. Acad. Sci. USA},
  year =	 2004,
  volume =	 101,
  number =	 8,
  pages =	 {2404--2409},
  comments =	 {},
  entered_on =	 {05/09/04},
}
@article{mwangi-siggia-03,
  title =	 {Genome wide identification of regulatory motifs in
                  Bacillus subtilis},
  author =	 {Michael Mwangi and Eric Siggia},
  abstract =	 {Background: To explain the vastly different
                  phenotypes exhibited by the same organism under
                  different conditions, it is essential that we
                  understand how the organism's genes are coordinately
                  regulated. While there are many excellent tools for
                  predicting sequences encoding proteins or RNA genes,
                  few algorithms exist to predict regulatory sequences
                  on a genome wide scale with no prior
                  information. <p>Results: To identify motifs involved
                  in the control of transcription, an algorithm was
                  developed that searches upstream of operons for
                  improbably frequent dimers. The algorithm was
                  applied to the B. subtilis genome, which is
                  predicted to encode for approximately 200 DNA
                  binding proteins. The dimers found to be
                  over-represented could be clustered into 317
                  distinct groups, each thought to represent a class
                  of motifs uniquely recognized by some transcription
                  factor. For each cluster of dimers, a representative
                  weight matrix was derived and scored over the
                  regions upstream of the operons to predict the sites
                  recognized by the cluster's factor, and a putative
                  regulon of the operons immediately downstream of the
                  sites was inferred. The distribution in number of
                  operons per predicted regulon is comparable to that
                  for well characterized transcription factors. The
                  most highly over-represented dimers matched
                  \sigma^A, the T-box, and \sigma^W sites. We have
                  evidence to suggest that at least 52 of our clusters
                  of dimers represent actual regulatory motifs, based
                  on the groups' weight matrix matches to
                  experimentally characterized sites, the functional
                  similarity of the component operons of the groups'
                  regulons, and the positional biases of the weight
                  matrix matches. All predictions are assigned a
                  significance value, and thresholds are set to avoid
                  false positives. Where possible, we examine our
                  false negatives, drawing examples from known
                  regulatory motifs and regulons inferred from RNA
                  expression data. Conclusions: We have demonstrated
                  that in the case of B. subtilis our algorithm allows
                  for the genome wide identification of regulatory
                  sites. As well as recovering known sites, we predict
                  new sites of yet uncharacterized factors. Results
                  can be viewed at
                  http://www.physics.rockefeller.edu/~mwangi/.},
  journal =	 {BMC Bioinformatics},
  year =	 2003,
  volume =	 4,
  number =	 18,
  pdf =		 {bioinformatics/mwangi-siggia-03.pdf},
  entered_on =	 {05/05/04},
  comments =	 {Good literature collection on searching for
                  overrepresented motifs. Last paragraph page 2 lists
                  detriments of the algorithm. Search for dimers of
                  length 4-5 with separation of 3-30. Significance of
                  dimers is determined by their occurence relative to
                  independent Poissonian occurence. Repeat of the same
                  nucleotides are ignored. A particular ad hoc metric
                  for comparing to dimers is introduced, and dimers
                  are then clustered according to this metric using
                  some particular clustering algorithm they
                  invented. Determining the optimal number of clusters
                  -- they should read on work of Still and Bialek. The
                  point of the work is that motifs =
                  overrepresentation. This should not be this way. On
                  the contrary, motifs can be underrepresented (that
                  is, evolution cleaned up all sequences except the
                  real motifs), or anything else. One should use
                  conserved sites over many species, or something
                  similar to see real motifs. Or maybe just any
                  statistical fluctuation, not necessarily
                  overrepresentation.},
}

@article{bussemaker-etal-00,
  title =	 {Building a dictionary for genomes: Identification of
                  presumptive regulatory sites by statistical
                  analysis},
  author =	 {H Bussemaker and H Li and E Siggia},
  abstract =	 {The availability of complete genome sequences and
                  mRNA expression data for all genes creates new
                  opportunities and challenges for identifying DNA
                  sequence motifs that control gene expression. An
                  algorithm, ÔÔMobyDick,ŐŐ is presented that
                  decomposes a set of DNA sequences into the most
                  probable dictionary of motifs or words. This method
                  is applicable to any set of DNA sequences: for
                  example, all upstream regions in a genome or all
                  genes expressed under certain
                  conditions. Identification of words is based on a
                  probabilistic segmentation model in which the
                  significance of longer words is deduced from the
                  frequency of shorter ones of various lengths,
                  eliminating the need for a separate set of reference
                  data to define probabilities. We have built a
                  dictionary with 1,200 words for the 6,000
                  upstreamregulatory regions in the yeast genome; the
                  500 most significant words (some with as few as 10
                  copies in all of the upstream regions) match 114 of
                  443 experimentally determined sites (a significance
                  level of 18 standard deviations). When analyzing all
                  of the genes up-regulated during sporulation as a
                  group, we find many motifs in addition to the few
                  previously identified by analyzing the subclusters
                  individually to the expression subclusters. Applying
                  MobyDick to the genes derepressed when the general
                  repressor Tup1 is deleted, we find known as well as
                  putative binding sites for its regulatory partners.},
  pages =	 {10096-10100},
  journal =	 {Proc. Natl. Acad. Sci. USA},
  year =	 2000,
  volume =	 97,
  number =	 18,
  entered_on =	 {08/01/05},
  pdf =		 {bioinformatics/bussemaker-li-siggia-00.pdf},
}


@Article{bussemaker-li-siggia-01,
  author =	 {H. Bussemaker and E. Siggia and H. Li},
  title =	 {Regulatory element detection using correlation with
                  expression},
  journal =	 {Nature Genetics},
  year =	 {2001},
  volume =	 {27},
  pages =	 {167--171},
  entered_on =	 {05/05/04},
  pdf =		 {bioinformatics/bussemaker-li-siggia-01.pdf},
  abstract =	 {We present here a new computational method for
                  discovering cis-regulatory elements that circumvents
                  the need to cluster genes based on their expression
                  profiles. Based on a model in which upstream motifs
                  contribute additively to the log-expression level of
                  a gene, this method requires a single genome-wide
                  set of expression ratios and the upstream sequence
                  for each gene, and outputs statistically significant
                  motifs. Analysis of publicly available expression
                  data for Saccharomyces cerevisiaereveals several new
                  putative regulatory elements, some of which
                  plausibly control the early, transient induction of
                  genes during sporulation. Known motifs generally
                  have high statistical significance.},
  comments =	 {The REDUCE paper. Biggest problem -- additive
                  regulation assumed. Fitting to data is done by
                  linear regression as well.},
}

@misc{eisenberg-levanon-03,
  author =	 {Eli Eisenberg and Erez Levanon},
  title =	 {Human housekeeping genes are compact},
  abstract =	 {We identify a set of 575 human genes that are
                  expressed in all conditions tested in a publicly
                  available database of microarray results. Based on
                  this common occurrence, the set is expected to be
                  rich in "housekeeping" genes, showing constitutive
                  expression in all tissues. We compare selected
                  aspects of their genomic structure with a set of
                  background genes. We find that the introns,
                  untranslated regions and coding sequences of the
                  housekeeping genes are shorter, indicating a
                  selection forcompactness in these genes. },
  url =		 {http://arxiv.org/abs/q-bio/0309020},
  pdf =		 {bioinformatics/eisenberg-levanon-03.pdf},
  entered_on =	 {04/13/04},
  comments =	 {The discussion regarding how transcription is costly
                  and, therefore, commonly expressed genes (like
                  housekeeping genes) should have short sequences is
                  interesting, but I find this principle hard to
                  believe: first, you need to make the aminoacid
                  sequence that will be able to do the job, and only
                  then may you care about the length of such
                  sequence.},
  howpublished = {arXiv: q-bio/0309020},
}

@misc{apostol-etal-03,
  title =	 {How Predictable Are Biological Sequences?},
  author =	 {Izydor Apostol and Philippe Jacquet and Wojciech
                  Szpankowski},
  year =	 2003,
  postscript =	 {bioinformatics/apostol-etal-03.ps.gz},
  howpublished = {European Conf. Comput. Biol., 2003},
  entered_on =	 {04/13/04},
  comments =	 {Application of the "Sampled Pattern Matching" (SPM)
                  prediction algorithm to biological data. The
                  algorithm is based upon building the pdf of the next
                  symbol conditional on the suffix of the length which
                  is a finite fraction of the longest suffix that has
                  appeared, at least, twice in the preceeding
                  sequence. The thorem that is alluded to in the text
                  is based on the Markovian assumption about the
                  source, which is not necessarily a biologically
                  relevant assumption. The paper mentiones that the
                  predictability, as measured by the error of the SPM
                  algorithm is different for conding vs. non-coding
                  regions in DNA, which is a nice, but not a novel
                  result.}
}

@inproceedings{lonardi-szpankowski-03,
  title =	 {Joint Source--Channel LZ'77 Coding},
  author =	 {Stefano Lonardi and Wojciech Szpankowski},
  booktitle =	 {Proc. Data Compr. Conf. 2003},
  year =	 2003,
  publisher =	 {IEEE Computer Society Press},
  pdf =		 {entropy/lonardi-szpankowski-03.pdf},
  entered_on =	 {04/13/04},
  abstract =	 {Limited memory and bounded communication resources
                  require powerful data compression techniques, but at
                  the same time noise tetherless channels and/or
                  corrupted file systems need error correction
                  capabilities. Joint source-channel coding has
                  emerged as a viable solution to this problem. We
                  present here the first practical joint
                  source-channel coding algorithm capable of
                  correcting errors in the popular Lempel-Ziv'77
                  scheme without practically loosing any compression
                  power. This is possible since the LZ'77 encoder does
                  not completely remove all redundancy. The inherent
                  additional redundancy left by the LZ'77 encoder is
                  used succinctly by a channel coder (e.g.,
                  Reed-Solomon coder) to protect against limited
                  number of errors. In addition to this, the scheme
                  proposed here is perfectly backward0compatible, that
                  is, a file compressed with our error resilient LZ-77
                  can be still decompressed by a common LZ'77
                  decoder. In this preliminary report, we present our
                  algorithm, collect some experimental data supporting
                  our claims, and provide some thoretical
                  justifications.},
  comments =	 {The error correction is based upon the fact that
                  the longest prefix in LZ'77 code usually appears
                  more than once in the preceeding parsed text. One
                  usually chooses a random (first) of this repeats and
                  points to it. Instead, one may point at the repeat
                  with number coded by the bits of some text sequence,
                  which may be, for example, the parity sequence for
                  the original text, etc. For Markov processes it's
                  proven that there will be enough of such redundancy
                  to be able to do error correction. Some cute
                  numerical experiments.}
}

@techreport{barash-friedman-02,
  institution =	 {Hebrew University, CS, Leibnitz center},
  number =	 {2002-05},
  author =	 {Yoseph Barash and Nir Friedman},
  title =	 {Context-Specific Bayesian Clustering for Gene
                  Expression Data},
  year =	 2002,
  url =
                  {http://leibniz.cs.huji.ac.il/tr/acc/2002/HUJI-CSE-LTR-2002-5_BF1Full.pdf},
  pdf =		 {bioinformatics/barash-friedman-02.pdf},
  abstract =	 {The recent growth in genomic data and measurements
                  of genome-wide expression patterns allows us to
                  apply computational tools to examine gene regulation
                  by transcription factors. In this work, we present a
                  class of mathematical models that help in
                  understanding the connections between transcription
                  factors and functional classes of genes based on
                  genetic and genomic data. Such a model represents
                  the joint distribution of transcription factor
                  binding sites and of expression levels of a gene in
                  a unified probabilistic model. Learning a combined
                  probability model of binding sites and expression
                  patterns enables us to improve the clustering of the
                  genes based on the discovery of putative binding
                  sites and to detect which binding sites and
                  experiments best characterize a cluster. To learn
                  such models from data, we introduce a new search
                  method that rapidly learns a model according to a
                  Bayesian score. We evaluate our method on synthetic
                  data as well as on real life data and analyze the
                  biological insights it provides. Finally, we
                  demonstrate the applicability of the method to other
                  data analysis problems in gene expression data. },
  entered_on =	 {02/20/04},
  comments =	 {A good primer on the method, superceedes previous
                  work by the authors on the subject ("default tables"
                  for description of contexts, structural "EM"
                  algorithm). I cannot agree with their claim that:
                  The main biological hypothesis underlying most of
                  these analyses is "Genes with a common functional
                  role have similar expression patterns across
                  different experiments." Good reference
                  section. Here's the summary of what they do: "Our
                  method clusters genes with similar expression
                  patterns and promoter regions . In addition, the
                  learned model provides insight on the regulation of
                  genes within each cluster. The key features of our
                  approach are: (1) automatic detection of the number
                  of clusters; (2) automatic detection of random
                  variables that are irrelevant to the clusters; (3)
                  robust clustering in the presence of many such
                  random variables, (4) context-depended
                  representation that describes which clusters each
                  attribute depends on."},
}

@TechReport{murhy-mian-99,
  author =	 {Kevin Murphy and Saira Mian},
  title =	 {Modelling Gene Expression Data using Dynamic
                  Bayesian Networks},
  institution =	 {Berkeley, CS Dept.},
  year =	 {1999},
  entered_n =	 {11/04/03},
  pdf =		 {bioinformatics/murphy-mian-99.pdf},
  abstract =	 {Recently, there has been much interest in reverse
                  engineering genetic networks from time series
                  data. In this paper, we show that most of the
                  proposed discrete time models -- including the
                  boolean network model [Kau93, SS96], the linear
                  model of Dohaeseleer et al. [DWFS99], and the
                  nonlinear model of Weaver et al. [WWS99] -- are all
                  special cases of a general class of models called
                  Dynamic Bayesian Networks (DBNs). The advantages of
                  DBNs include the ability to model stochasticity, to
                  incorporate prior knowledge, and to handle hidden
                  variables and missing data in a principled way. This
                  paper provides a review of techniques for learning
                  DBNs. Keywords: Genetic networks, boolean networks,
                  Bayesian networks, neural networks, reverse
                  engineering, machine learning.},
  comments =	 {Simple non-technical description of BN's and
                  DBN's. Nothing in terms of new results.},
}

@misc{nimwegen-03,
  howpublished = {E-print},
  title =	 {Scaling laws in the functional content of genomes},
  author =	 {Erik van Nimwegen},
  year =	 2003,
  abstract =	 {With the number of sequenced genomes now over one
                  hundred, and the availability of rough functional
                  annotations for a substantial proportion of their
                  genes, it has become possible to study the
                  statistics of gene content across genomes. Here I
                  show that, for many high-level functional
                  categories, the number of genes in the category
                  scales as a power-law in the total number of genes
                  in the genome. The occurrence of such scaling laws
                  can be explained with a simple theoretical model,
                  and this model suggests that the exponents of the
                  observed scaling laws correspond to universal
                  constants of the evolutionary process. I discuss
                  some consequences of these scaling laws for our
                  understanding of organism design. },
  url =		 {http://arxiv.org/abs/physics/0307001},
  pdf =		 {bioinformatics/nimwegen-03.pdf},
  entered_on =	 {09/18/03},
  comments =	 {The paper reviews scalings of the number of genes in
                  a given functional units versus the total number of
                  genes in the organism, as given by some (standard)
                  databases. The dependence is, apparently, a power
                  law for many different classes of genes and
                  different types of animals. There are, obviously,
                  many things that can go wrong with such analysis (is
                  it really a power law? what's the dependence on the
                  database? on assignment to functional units? etc.),
                  but Erik seems to address most of such questions
                  very reasonably. The simple model proposed to
                  explain the results is probably too simplistic.},
}

@article{ebeling-jimenezmontano-80,
  author =	 {W. Ebeling and M. A. Jimenez-Montano},
  title =	 {On grammars, complexity and information measures of
                  biological macromolecules},
  journal =	 {Math. Biosci.},
  volume =	 52,
  year =	 1980,
  pages =	 {53--71},
  keywords =	 {TO_GET},
  comments =	 {Should contain some references to the
                  Thiele-Schreidereiter grammar complexity
                  compression. Also see
                  \cite{jimenezmontano-etal-02,grassberger-02} for
                  subsequent developments.},
}

@article{grosse-etal-00,
  title =	 {Species Independence of Mutual Information in Coding
                  and Noncoding DNA},
  author =	 {Ivo Grosse and Hanspeter Herzel and Sergey
                  V. Buldyrev and H. Eugene Stanley},
  journal =	 {Physical Review E},
  volume =	 61,
  number =	 5,
  pages =	 {5624--5629},
  month =	 {May},
  year =	 2000,
  abstract =	 {We explore if there exist universal statistical
                  patterns that are different in coding and noncoding
                  DNA and can be found in all living organisms,
                  regardless of their phylogenetic origin. We find
                  that (i) the mutual information function has a
                  significantly different functional form in coding
                  and noncoding DNA. We further find that (ii) the
                  probability distributions of the average mutual
                  information I are significantly different in coding
                  and noncoding DNA, while (iii) they are almost the
                  same for organisms of all taxonomic
                  classes. Surprisingly, we find that I is capable of
                  predicting coding regions as accurately as
                  organism-specific coding measures.},
  keywords =	 {TO_GET},
  entered_on =	 {08/21/2001}
} 

@article{herzel-etal-98,
  author =	 {H. Herzel and E. N. Trifonov and O. Weiss and
                  I. Grosse},
  title =	 {Interpreting Correlations in Biosequences},
  journal =	 {Physica A},
  volume =	 249,
  pages =	 {449--459},
  year =	 1998,
  entered_on =	 {08/21/2001},
  keywords =	 {TO_GET},
} 

@article{herzel-grosse-95,
  author =	 {H. Herzel and I. Grosse},
  title =	 {Measuring correlations in symbol sequences},
  journal =	 {Physica A},
  volume =	 216,
  number =	 4,
  pages =	 {518--542},
  month =	 {July},
  year =	 1995,
  abstract =	 {The paper is devoted to relations between
                  correlation functions and mutual information, it is
                  shown that, in the sequences over an alphabet of
                  lambda symbols, statistical dependences are measured
                  by (lambda-1)/2 independent parameters. However, not
                  all of them can be determined by autocorrelation
                  functions. Appropriate sets of correlation functions
                  (including crosscorrelations) are introduced, which
                  allow the detection of all dependences. The results
                  are exemplified fir binary, ternary, and quaternary
                  symbol sequences. As an application, it is discussed
                  that a nonuniform codon usage in protein-coding DNA
                  sequences introduces periodic correlations even at
                  distances in the order of 1000 base pairs},
  keywords =	 {TO_GET, should have a xerox},
} 

@article{herzel-grosse-97,
  author =	 {H. Herzel and I. Grosse},
  title =	 {Correlations in DNA Sequences - the Role of Protein
                  Coding Segments},
  journal =	 {Phys. Rev. E},
  volume =	 55,
  pages =	 {800-810},
  year =	 1997,
  entered_on =	 {08/21/2001},
  keywords =	 {TO_READ},
  url =		 {bioinformatics/herzel_grosse_97.pdf},
}

@article{li-kaneko-92,
  author =	 {W. Li and K. Kaneko},
  title =	 {Long-range correlations and partial 1/f spectrum in
                  noncoding DNA sequence},
  journal =	 {Europhys. Lett.},
  volume =	 17,
  year =	 1992,
  pages =	 {655--661},
  keywords =	 {TO_GET},
}

@article{schmitt-herzel-97-jtb,
  author =	 {A. O. Schmitt and H. Herzel},
  title =	 {Estimating the Entropy of DNA Sequences},
  journal =	 {J. theor. Biol.},
  volume =	 188,
  pages =	 {369--377},
  year =	 1997,
  entered_on =	 {08/21/2001},
  keywords =	 {TO_GET},
}

@article{stanley-etal-94,
  author =	 {H.E. Stanley and others},
  title =	 {Statistical mechanics in biology: how ubiquitous are
                  long-range correlations?},
  journal =	 {Physica A},
  volume =	 205,
  year =	 1994,
  pages =	 {214--241},
  comments =	 {TO_GET}
}

@article{weiss-herzel-98,
  author =	 {O. Weiss and H. Herzel},
  title =	 {Measuring Correlations in Protein Sequences},
  journal =	 {Z. Phys. Chem.},
  volume =	 204,
  pages =	 {183-197},
  year =	 1998,
  entered_on =	 {08/21/2001},
  keywords =	 {TO_GET},
}

@article{weiss-herzel-98-jtb,
  author =	 {O. Weiss and H. Herzel},
  title =	 {Correlations in Protein Sequences and Property
                  Codes},
  journal =	 {J. theor. Biol.},
  volume =	 190,
  pages =	 {341--353},
  year =	 1998,
  entered_on =	 {08/21/2001},
  keywords =	 {TO_GET},
}

@article{weiss-jimenezmontano-herzel-00,
  author =	 {O. Weiss and M. Jimenez-Montano and H. Herzel},
  title =	 {Information content of protein sequences},
  journal =	 {J. theor. Biol.},
  volume =	 206,
  pages =	 {379--386},
  year =	 2000,
  entered_on =	 {08/21/2001},
  keywords =	 {TO_GET},
}@article{drew-etal-05,
  title  =       {Temporal Control of Conditioned Responding in Goldfish},
  author =       {M Drew and B Zupan and A Cooke and P Couvillon and P
                  Balsam},
  abstract =     {The peak procedure was used to characterize response
                  timing during acquisition and maintenance of 
                  conditioned responding in goldfish. Subjects
                  received light-shock pairings with a 5- or 15-s
                  interstimulus interval. On interspersed peak trials,
                  the conditioned stimulus light was presented for 45
                  s and no shock was delivered. Peaks in the
                  conditioned response, general activity, occurred at
                  about the time of the expected unconditioned
                  stimulus, and variability in the activity
                  distribution was scalar. Modeling of the changes in
                  the activity distributions over sessions revealed
                  that the temporal features of the conditioned
                  response changed very little during acquisition. The
                  data suggest that times are learned early in
                  training,  and, contrary to I. P. Pavlovâ€™s
                  (1927/1960) concept of â€śinhibition of delay,â€ť that
                  timing is learning when to respond rather than
                  learning when not to respond. }, 
  journal =      {J Exper. Psychology: Animal Behav. Proc.},
  year =         2005,
  volume =       31, 
  number =       1,
  pages =        {31â€“39},
  pdf =          {bio-learning/drew-etal-05.pdf},
  comments =     {Nice set of references to timing of the CR after the
                  onset of the CS in vertebrates; this specific paper
                  examines the goldfish.},
}

@article{balsam-gibbon-88,
  author =	 {P Balsam and J Gibbon}, 
  year =         1988,
  title =        {Formation of toneâ€“US associations 
                  does not interfere with the formation of contextâ€“US
                  associations in pigeons},
  journal =      {J. Experim. Psychology: Animal Behav. Proc.},
  volume =       14,
  pages =        {401â€“-412},
  pdf =          {bio-learning/balsam-gibbon-88.pdf},
  abstract =     {In four experiments we investigated whether signaled
                  and unsignaled US presentations resulted in
                  differential context conditioning. Experiments 1 and
                  2 showed that the presence of a tone during grain
                  presentation facilitated the formation of tone-food
                  associations in pigeons. Experiment 2 also showed
                  that the acquisition of associative value by the
                  tone did not diminish associations between context
                  and the unconditioned stimulus (US). Experiment 3
                  showed that signaled USs did not interfere with the
                  acquisition of context-US associations, and
                  Experiment 4 showed that even when the signal was
                  extensively pretrained, context-US associations
                  could not be blocked. The results of these
                  experiments are inconsistent with conditioning
                  models that require competition between cues and
                  contexts for associative value.},
  comments =     {Comparison of association to background and to
                  CS. Are they independent? Can backgroundn be
                  overshadowed? Is background just another CS?
                  Comparison to Scalar Expectancy theory and to
                  Rescorla-Wagner. Acquisition is dramatically faster
                  if the context is extinguished prior to
                  autoshaping. Need to read more thoroughly.}
}


@article{grossberg-82,
  author =       {S Grossberg},
  year =         1982,
  title =        {Processing of expected and unexpected events during 
                  conditioning and attention: A psychophysiological
                  theory},
  journal =      {Psychological Rev.}, 
  volume =       89,
  pages =        {529â€“572},
  comments =     {Studies competition between different CS's to
                  predict a US.},
}

@incollection{jenkins-etal-81,
  author =       {H Jenkins and R Barnes and F Barrera},
  year =         1981,
  title =        {Why autoshaping depends on trial spacing},
  editor =       {C Locurto and H Terrace and J Gibbon},
  booktitle =    {Autoshaping and conditioning theory},
  pages =        {255â€“284},
  address =      {New York},
  publisher =    {Academic Press},
  comments =     {Fast (about 30 rewards) acquisition time for the
                  CR.}
}


@article{gallistel-gibbon-00,
  author =	 {CR Gallistel and J Gibbon},
  year =	 2000,
  title =	 {Time, rate and conditioning},
  journal =	 {Psychological Rev.},
  volume =	 107,
  pages =	 {289--344},
  entered_on =   {01/03/06},
  pdf =          {bio-learning/gallistel-gibbon-00.pdf},
  abstract =     {We draw together and develop previous timing models
                  for a broad range of conditioning phenomena to
                  reveal their common conceptual foundations: First,
                  conditioning depends on the learning of the temporal
                  intervals between events and the reciprocals of
                  these intervals, the rates of event
                  occurences. Second, remembered intervals and rates
                  translate into observed behavior through decision
                  processes whose structure is adapted to noise in the
                  decision variables. The noise and the uncertainties
                  consequent upon it have both subjective and
                  objective origins. A third feature of these models
                  is their time-scale invariance, which we argue is a
                  deeply important property evident in the available
                  experimental data. This conceptual framework is
                  similar ro rhe psychophysical conceptual framework
                  in which contemporary models of sensory processinf
                  are rooted. We contrast it with the associative
                  conceptual framework.},
  comments =      {Review of the scalar expectancy theory and the rate
                  estimation theory. Some interesting points (see also
                  \cite{kakade-dayan-02} for summary of some): time to
                  acquisition of CR depends on the number of
                  reinforcers and does not depend on the fraction of
                  reinforced CS. Note that RET for qcquisition of
                  conditioned response would suggest excessively large
                  thresholds \beta, as argued in
                  \cite{kakade-dayan-02}. Review of all major
                  conditioning paradigms.},
}


@incollection{dayan-01,
  author =	 {P Dayan},
  year =         2001,
  title =        {Reinforcement learning},
  editor =       {CR Gallistel},
  booktitle =        {Steven's Handbook of Experimental Psychology},
  address =      {New York, NY},
  publisher =    {Wiley},
  pdf =          {bio-learning/dayan-01.pdf},
}



@inproceedings{dayan-long-98,
  title =	 {Statistical models of conditioning},
  author =	 {P Dayan and T Long},
  year =	 1998,
  abstract =	 {Conditioning experiments probe the ways that animals
                  make predictions about rewards and punishments and
                  use those predictions to control their behavior. One
                  standard model of conditioning paradigms which
                  involve many conditioned stimuli suggests that
                  individual predictions should be added
                  together. Various key results show that this model
                  fails in some circumstances, and motivate an
                  alternative model, in which there is attentional
                  selection between different available stimuli. The
                  new model is a form of mixture of experts, has a
                  close relationship with some other existing
                  psychological suggestions, and is statistically
                  well-founded. },
  booktitle =	 {Adv. Neural Inf. Proc. Syst. 10},
  publisher =	 {MIT Press},
  pdf=           {bio-learning/dayan-long-98.pdf},
  comments =     {The paper has all the usual problems of the paper
                  where the exprerimentalist does not agree with the
                  animal on the priors. They review different
                  conditioning experiments, in particularly focusing
                  on the downwards unblocking, which seems to be
                  unexplainable by the standard
                  \cite{rescorla-wagner-72} US-processing theory, that is,
                  prediction-discrepancy-reinforcement type of
                  learning modeling. They present a different model
                  where each of the CSs acts as an independent expert
                  in predicting a US, and the cooperative mixture of
                  experts is used by the animals to make its
                  predictions. References I need to read at Konorski,
                  1967, Grossberg, 1982, Solomon and Corbit 1974
                  ("opponency").  }
}

@article{kakade-dayan-02,
  title =	 {Acquisition and Extinction in Autoshaping},
  author =	 {Sham Kakade and Peter Dayan},
  abstract =	 {C. R. Gallistel and J. Gibbon (2000) presented
                  quantitative data on the speed with which animals
                  acquire behavioral responses during autoshaping,
                  together with a statistical model of learning
                  intended to account for them. Although this model
                  captures the form of the dependencies among critical
                  variables, its detailed predictions are
                  substantially at variance with the data. In the
                  present article, further key data on the speed of
                  acquisition are used to motivate an alternative
                  model of learning, in which animals can be
                  interpreted as paying different amounts of attention
                  to stimuli according to estimates of their
                  differential reliabilities as predictors.},
  pdf =		 {bio-learning/kakade-dayan-02.pdf},
  journal =	 {Psych. Rev.},
  year =	 2002,
  volume =	 109,
  number =	 3,
  pages =	 {533--544},

  comments =	 {Starts with references to earlier papers by Gibbons,
                  Gallistel, Balsam, etc. Reviews data that time to CR
                  acquisition is proportional to CS duration (that is,
                  inversely proportional to US rate during CS
                  presence), and inverseley proportional to the
                  intertrial distance (that is, proportional to the
                  conservative, Laplace, estimate of the background US
                  rate). Further, the number of US presentations (not
                  of trial in partial reinforcement scenarios)
                  determines time to acquisition. They show, however,
                  that simple rate estimation produces inconsistencies
                  with data, since the learning seems to be much
                  slower in the zero background rate case when the
                  background US rate is present (fig 2). Further (fig
                  3 and related discsussion) there is a problem of
                  attributing the USs to the background or to the
                  CS's, especially if some prior exposure to US has
                  been arranged (without a CS). In this case, the
                  background rate should depend on the delivery rate
                  of US in prior non-CS regions, an so should the time
                  to acquisition. They do not; and the dependence is
                  only on the number of prior non-CS related USs. The
                  authors introduce the ``windowing'' model for
                  estimation of the background US rate and the US rate
                  during the CS. Of course, then one must agree with
                  the animal how the window length should be set; they
                  do not really address this issue (a more general
                  Kalman-filter-like model introduced in the Appendix
                  suffers from the same problem). Then each expert
                  (CS, background, etc.) has its own estimate of the
                  US rate, and the overall animal's expectation is a
                  weighted sum of these predictions, with the weights
                  proportional to the reliability of each individual
                  expert; the experts may thus, for example, block
                  each other. No law for the evolution of
                  reliabilities is given. With some empirical laws, it
                  might be possible to resolve the problem of varying
                  acquisition rates (see above). But is this really an
                  improvement -- we have produced a better fit by
                  merely introducing yet another free, undetermined,
                  function -- the reliability. In general, the article
                  focuses on testing certain laws, algorithms for
                  modeling animal behavior instead of studying the
                  problem that is being solved by the animal in some
                  general, method-independent, way. It's difficult to
                  model learning when you don't know what the animal's
                  priors are.},
}

@inproceedings{kakade-dayan-00,
  booktitle =	 {Adv. Neural. Inf. Proc. Syst. (NIPS) 12},
  year =	 2000,
  author =	 {Sham Kakade and Peter Dayan},
  title =	 {Acquisition in autoshaping},
  abstract =	 {Quantitative data on the speed with which animals
                  acquire behavioral responses during classical
                  conditioning experiments should provide strong
                  constraints on models of learning. However, most
                  models have simply ignored these data; the few that
                  have attempted to address them have failed by at
                  least an order of magnitude. We discuss key data on
                  the speed of acquisition, and show howto account for
                  themusing a statistically sound model of learning,
                  in which differential reliabilities of stimuli play
                  a crucial role.},
  pdf =		 {bio-learning/kakade-dayan-00.pdf},
  comments =	 {Preliminary version of \cite{kakade-dayan-02}.},
}
@incollection{bialek-02,
  booktitle =	 {Physics of bio-molecules and cells: Les Houches,
                  Session LXXV, 2-27 July 2001},
  editor =	 {H Flyvbjerg and F Julicher and P Ormos and F David},
  publisher =	 {EDP Sciences, Springer},
  address =	 {Les Ulis, Berlin},
  year =	 2002,
  author =	 {W Bialek},
  title =	 {Thinking about the brain},
  pages =	 {486--577},
  pdf =		 {bio-learning/bialek-02.pdf},
  entered_on =	 {07/07/05},
  abstract =	 {We all are fascinated by the phenomena of
                  intelligent behavior, as generated both by our own
                  brains and by the brains of other animals. As
                  physicists we would like to understand if there are
                  some general principles that govern the structure
                  and dynamics of the neural circuits that underlie
                  these phenomena. At the molecular level there is an
                  extraordinary universality, but these mechanisms are
                  surprisingly complex. This raises the question of
                  how the brain selects from these diverse mechanisms
                  and adapts to compute "the right thing" in each
                  context. One approach is to ask what problems the
                  brain really solves. There are several examples -
                  from the ability of the visual system to count
                  photons on a dark night to our gestalt recognition
                  of statistical tendencies toward symmetry in random
                  patterns - where the performance of the system in
                  fact approaches some fundamental physical or
                  statistical limits. This suggests that some sort of
                  optimization principles may be at work, and there
                  are examples where these principles have been
                  formulated clearly and generated predictions which
                  are confirmed in new experiments; a central theme in
                  this work is the matching of the coding and
                  computational strategies of the brain to the
                  statistical structure of the world around
                  us. Extension of these principles to the problem of
                  learning leads us into interesting theoretical
                  questions about how to measure the complexity of the
                  data from which we learn and the complexity of the
                  models that we use in learning, as well as opening
                  some new opportunities for experiment. This
                  combination of theoretical and experimental work
                  gives us some new (if still speculative)
                  perspectives on classical problems and controversies
                  in cognition.},
  url =		 {http://arxiv.org/abs/physics/0205030},
}


@article{gallistel-etal-04,
  title =	 {The learning curve: Implications of a quantitative
                  analysis},
  author =	 {CR Gallistel and S Fairhurst and P Balsam},
  abstract =	 {The negatively accelerated, gradually increasing
                  learning curve is an artifact of group averaging in
                  several commonly used basic learning paradigms
                  (pigeon autoshaping, delay- and trace-eyeblink
                  conditioning in the rabbit and rat, autoshaped
                  hopper entry in the rat, plus maze performance in
                  the rat, and water maze performance in the
                  mouse). The learning curves for individual subjects
                  show an abrupt, often step-like increase from the
                  untrained level of responding to the level seen in
                  the well trained subject. The rise is at least as
                  abrupt as that commonly seen in psychometric
                  functions in stimulus detection experiments. It may
                  indicate that the appearance of conditioned behavior
                  is mediated by an evidence-based decision process,
                  as in stimulus detection experiments. If the
                  appearance of conditioned behavior is taken instead
                  to reflect the increase in an underlying associative
                  strength, then a negligible portion of the function
                  relating associative strength to amount of
                  experience is behaviorally visible. Consequently,
                  rate of learning cannot be estimated from the
                  group-average curve; the best measure is latency to
                  the onset of responding, determined for each subject
                  individually.},
  journal =	 {Proc. Natl. Acad. Sci. (USA)},
  year =	 2004,
  volume =	 101,
  number =	 36,
  pages =	 {13124--13131},
  pdf =		 {bio-learning/gallistel-etal-04.pdf},
  entered_on =	 {07/07/05},
  comments =	 {The paper reviews a series of various conditioning
                  experiments and shows that in all cases learning is
                  almost instantaneous on a single individual level
                  (while smooth, if averaged over populations). This
                  resembles many effects in other fields of
                  literature, such as \cite{cluzel-etal-00}. Of
                  course, in this work the authors talk about
                  acquisition of conditioning behavior (that is,
                  making the animals understand that there is a very
                  strong CS-US association), rather then learning a
                  value of the association from some set. It's unclear
                  what the priors are in this case, thus it is unclear
                  how should learning proceed. Further, behavior is
                  measured by some summary statistics. While this
                  statistics changes abruptly, other aspects may be
                  evolving slower. Thirdly, in psychophysics signal
                  detection experiments, one notices that decisions
                  come abruptly if you force them to. Allowing for
                  probabilistic choice even in detection tasks makes
                  learning smooth (no "rounding" effect).},
}

@article{knudsen-02,
  author =	 {E Knudsen},
  year =	 2002,
  journal =	 {Nature},
  volume =	 417,
  pages =	 {328--328},
  title =	 {Instructed learning in the auditory localization
                  pathway of the barn owl},
}


@article{miller-56,
  title =	 {The Magical Number Seven, Plus or Minus Two: Some
                  Limits on Our Capacity for Processing Information},
  author =	 {George A. Miller},
  journal =	 {Psych. Rev.},
  year =	 1956,
  volume =	 63,
  pages =	 {81--97},
  pdf =		 {bio-learning/miller-56.pdf},
  comments =	 {Classical paper suggesting that in many conditions
                  people can discriminate about 7 possibilities only.},
  entered_on =	 {06/29/05},
}

@article{tversky-kahneman-81,
  journal =	 {Science},
  year =	 1981,
  volume =	 211,
  pages =	 {453-458},
  author =	 {A Tversky and D Kahneman},
  title =	 {The framing of decisions and the psychology of
                  choice},
  abstract =	 {The psychological principles that govern the
                  perception of decision problems and the evaluation
                  of probabilities and outcomes produce predictable
                  shifts of preference when the same problem is framed
                  in different ways. Reversals of preference are
                  demonstrated in choices regarding monetary outcomes,
                  both hypothetical and real, and in questions
                  pertaining to the loss of human lives. The effects
                  of frames on preferences are compared to the effects
                  of perspectives on perceptual appearance. The
                  dependence of preferences on the formulation of
                  decision problems is a significant concern for the
                  theory of rational choice.},
  entered_on =	 {06/30/05},
  pdf =		 {bio-learning/tversky-kahneman-81.pdf},
  comments =	 {A classic paper about "irrational" decision making
                  by humans. They propose the "prospects" theory
                  instead of the "utility" theory. The hallmarks of
                  the theory are: larger (negative) utility for loss
                  than the (positive) utility for similar
                  gain. Convexity of the utility (the value of epsilon
                  on top of a large number is less than the value of
                  epsilon alone). Additionally, when averaging
                  potential income, people weigh small probabilities
                  disproportionally highly (could it be because small
                  probabilities come with respectively larger error
                  bars?). With such nonlinear cost and weighting, it
                  becomes important, which level is chosen as the
                  status-quo (zero loss/gain), and whether multiple
                  choices are considered independent, or as a part of
                  a one big decisions.},
}


@article{rescorla-00,
  title =	 {Associative changes with a random CS-US
                  relationship},
  author =	 {R Rescorla},
  abstract =	 {Three experiments examined conditioned magazine
                  approach in rats when a positive unconditioned
                  stimulus (US) bore a random relation to a
                  conditioned stimulus (CS). Experiment 1 found that
                  over the course of conditioning the CS initially
                  elevated responding relative to the baseline but
                  then lost the power to do so. Transfer tests
                  revealed that a CS-US association developed early
                  and persisted despite the decline in magazine
                  responding. Experiment 2 confirmed the persistence
                  of CS-US associations and found them to be more
                  substantial when a different US occurred during the
                  CS than in its absence. In Experiment 3, when the
                  situation was exposed to US alone presentations
                  prior to introducing the CS, there was little
                  evidence that a subsequent random relation between
                  the CS and US produced an association between
                  them. These results agree with those of blocking and
                  overshadowing experiments using discrete CSs and
                  support an interpretation of the random procedure in
                  terms of competition between the background and CS
                  for conditioning.},
  journal =	 {The Quarterly J. Exp. Psych.},
  year =	 2000,
  volume =	 {53B},
  number =	 4,
  pages =	 {325-340},
  pdf =		 {bio-learning/rescorla-2000.pdf},
  comments =	 {The experiments have quite complicated
                  contingencies. I would not be able to follow them;
                  it's suprizing that the rats do what they do. Notice
                  that the rat at the start of the experiment are
                  quite naive. Thus increases of responses to CS which
                  is only randomly related to US might be due to the
                  rat just figuring out the relation, not knowing
                  precisely what to expect, and clearly seeing that
                  the food is more plentiful when the CS is around
                  compared to all the time during the day when it is
                  not in the experiment. I think a trained rat at the
                  beginning of the experiment might be better aswe
                  know that it is not learning about the environment,
                  about statistics of the food delivery, and all that
                  kind of crap, but only about CS-US
                  association. Further, the paper makes a clear
                  distinction between the acquisition of the behavior
                  and extinction of the behavior. The transfer method
                  for measuring associations is cute. However, it
                  measures something about the animal's behavior
                  (which involves animal's desires, moods, etc.) which
                  is not the same as measuring that the animal has
                  learned the association. We need to look for changes
                  in the behavior (any changes!) as a function of time
                  to analyze the latter. There are also some possible
                  problems with the experiment not being truly random
                  (top of page 329).}
}

@article{fusi-etal-05,
  author =	 {S Fusi and P Drew and L Abbott},
  journal =	 {Neuron},
  volume =	 45,
  pages =	 {599-611},
  year =	 2005,
  title =	 {Cascade Models of Synaptically Stored Memories},
  abstract =	 {Storing memories of ongoing, everyday experiences
                  requires a high degree of plasticity, but retaining
                  these memories demands protection against changes
                  induced by further activity and experience. Models
                  in which memories are stored through switch-like
                  transitions in synaptic efficacy are good at storing
                  but are bad at retaining memories if these
                  transitions are likely, and they are poor at storage
                  but good at retention if they are unlikely. We
                  construct and study a model in which each synapse
                  has a cascade of states with different levels of
                  plasticity, connected by metaplastic
                  transitions. This cascade model combines high levels
                  of memory storage with long retention times and
                  significantly outperforms alternative models. As a
                  result, we suggest that memory storage requires
                  synapses with multiple states exhibiting dynamics
                  over a wide range of timescales, and we suggest
                  experimental tests of this hypothesis.},
  pdf =		 {bio-learning/fusi-etal-05.pdf},
  comments =	 {An explicit model that generates power law
                  forgetting from exponential transitions with rates
                  spread over many orders of magnitude. Interesting,
                  but to some extent pretty obvious a posteriori.},
}

@article{earn-johnstone-97,
  title =	 {A systematic error in tests of ideal free theory},
  author =	 {D Earn and R Johnstone},
  abstract =	 {Classical ideal free theory predicts that the
                  distribution of consumers within a patchy
                  environment should correspond to the distribution of
                  resources. Tests of these predictions have
                  inappropriately compared ratios of mean resource
                  levels and mean consumer densities, rather than
                  means of ratios. We show that this error, which has
                  propagated through hundreds of studies, leads to a
                  systematic bias: the theory will appear to
                  underestimate the number of consumers occupying poor
                  patches. We explain the correct way to test the
                  ideal free theory and apply it to published data;
                  the classical model is then seen to yield far more
                  accurate predictions than previously thought.},
  journal =	 {Proc. Roy. Soc. Lond. B},
  year =	 1997,
  volume =	 264,
  pages =	 {1671--1675},
  pdf =		 {bio-learning/earn_johnstone_97.pdf},
  entered_on =	 {01/19/05},
  comments =	 {I think that Appendix 1 (derivation of crucial
                  results) is simply wrong. The results may still hold
                  though.},
}

@article{balkovsky-shraiman-02,
  journal =	 {Proc. natl. Acad. Sci},
  year =	 2002,
  volume =	 99,
  number =	 20,
  pages =	 {12589--12593 },
  title =	 { Olfactory search at high Reynolds number},
  author =	 { E Balkovsky and B Shraiman},
  abstract =	 { Locating the source of odor in a turbulent
                  environmenta common behavior for living organismsis
                  nontrivial because of the random nature of
                  mixing. Here we analyze the statistical physics
                  aspects of the problem and propose an efficient
                  strategy for olfactory search that can work in
                  turbulent plumes. The algorithm combines the maximum
                  likelihood inference of the source position with an
                  active search. Our approach provides the theoretical
                  basis for the design of olfactory robots and the
                  quantitative tools for the analysis of the observed
                  olfactory search behavior of living creatures (e.g.,
                  odor-modulated optomotor anemotaxis of moths).},
  entered_on =	 {07/21/04},
  pdf =		 {bio-learning/balkovsky-shraiman-02.pdf},
  comments =	 {Smell propagates in the wind in terms of long lived
                  localized smell patches. While the algorithm they
                  produce is most probably OK, the derivation is
                  lacking. They claim that each arriving patch reduces
                  the uncertainty about the source, and the best
                  strategy is to move in the direction of the
                  patch. Why? A better way to formulate the problem
                  would be as follows. Write down the entropy of the
                  source location, and choose the trajectories that
                  minimize such entropy (maximum MI between source and
                  next step). This will have to be aveaged over random
                  plume positions. Notice that this will take into the
                  account the fact that maximum-likelihood like
                  methods wont work -- if by mistake you exit the
                  plume cone, you will never find the source.},
}

@article{brenner-00,
  journal =	 {Neuron},
  volume =	 26,
  pages =	 {695--702},
  year =	 2000,
  title =	 {Adaptive Rescaling Maximizes Information
                  Transmission},
  author =	 {Naama Brenner and William Bialek and Rob de Ruyter
                  van Steveninck},
  pdf =		 {bio-learning/brenner_etal_00.pdf},
  abstract =	 {Adaptation is a widespread phenomenon in nervous
                  systems, providing flexibility to function under
                  varying external conditions. Here, we relate an
                  adaptive property of a sensory system directly to
                  its function as a carrier of information about input
                  signals. We show that the input/output relation of a
                  sensory system in a dynamic environment changes with
                  the statistical properties of the
                  environment. Specifically, when the dynamic range of
                  inputs changes, the input/output relation rescales
                  so as to match the dynamic range of responses to
                  that of the inputs. We give direct evidence that the
                  scaling of the input/output relation is set to
                  maximize information transmission for each
                  distribution of signals. This adaptive behavior
                  should be particularly useful in dealing with the
                  intermittent statistics of natural signals.},
  entred_on =	 {06/02/03},
  comments =	 {As the abstract says, the paper analyzes reponse of
                  the H1 neuron two slow and fast (compared to the fly
                  behavioral response time of 30ms) changing velocity
                  stimulus. In both cases it is evident that the fly
                  adpats its coding strategy to the stimulus, and it
                  looks that adaptation is such as to maximize the
                  information transfer.},
}

@article{deweese-zador-98,
  title =	 {Asymmetric Dynamics in Optimal Variance Adaptation},
  author =	 {Michael DeWeese and Anthony Zador},
  abstract =	 {It has long been recognized that sensory systems
                  adapt to their inputs. Here we formulate the problem
                  of optimal variance estimation for a broad class of
                  nonstationary signals. We show that under weak
                  assumptions, the Bayesian optimal causal variance
                  estimate shows asymmetric dynamics: an abrupt
                  increase in variance is more readily detectable than
                  an abrupt decrease. By contrast, optimal adaptation
                  to the mean displays symmetric dynamics when the
                  variance is held fixed. After providing several
                  empirical examples and a simple intuitive argument
                  for our main result, we prove that optimal
                  adaptation is asymmetrical in a broad class of model
                  environments. This observation makes specific and
                  falsifiable predictions about the time course of
                  adaptation in neurons probed with certain stimulus
                  ensembles.},
  journal =	 {Neural Comput.},
  volume =	 10,
  pages =	 {1179--1202},
  year =	 1998,
  pdf =		 {bio-learning/deweese-zador-98.pdf},
  entered_on =	 {05/30/03},
  comments =	 {The paper shows that it's more difficult to see a
                  variance decrease than the decrease (provided, tails
                  of the underlying distributions are not too
                  heavy). They use causal estimation, where the value
                  of the variance at the next point depends on only
                  the previous ones, and only through the last
                  point.  It is not obvious that, once a new datum
                  comes in, one should not revisit one's earlier
                  estimate, and do a re-estimation (batch
                  vs. online). Then the precedure will become similar
                  to Kalman filtering -- prediction/correction
                  scheme. This deserves further attention.},
}

@article{fairhall-02,
  title =	 {Efficiency and ambiguity in an adaptive neural code},
  author =	 {Adrienne L.~Fairhall and Geoffrey D.~Lewen and
                  William Bialek and Robert R.~de Ruyter van
                  Steveninck},
  abstract =	 {We examine the dynamics of a neural code in the
                  context of stimuli whose statistical properties are
                  themselves evolving dynamically. Adaptation to these
                  statistics occurs over a wide range of
                  timescales -- from tens of milliseconds to
                  minutes. Rapid components of adaptation serve to
                  optimize the information that action potentials
                  carry about rapid stimulus variations within the
                  local statistical ensemble, while changes in the
                  rate and statistics of action-potential Â®ring encode
                  information about the ensemble itself, thus
                  resolving potential ambiguities. The speed with
                  which information is optimized and ambiguities are
                  resolved approaches the physical limit imposed by
                  statistical sampling and noise.},
  entered_on =	 {05/30/03},
  pdf =		 {bio-learning/fairhall_etal_02.pdf},
  journal =	 {Nature},
  year =	 2002,
  volume =	 {412},
  issue =	 23,
  pages =	 {787--792},
  comments =	 {The paper discsusses adaptation of the neural code
                  in the fly to velocity signal with zero mean
                  velocity and different variances. The following
                  effects are noticed. For periodic variance
                  modulation, adaptation seems to have a time scale
                  proportinal to the period. Adaptation to higher
                  variance happens faster tha to lower
                  variance. Information about the signal per spike is
                  (almost) the same irrespective of the signal
                  variance. It slightly drops when the variance is
                  abruptly shifted downwards, but then
                  recovers. Information per spike about the signal
                  variance seems to be constant. The paper suggests
                  that precise spike placement encodes precise
                  temporal structure of the signal, while the longer
                  (averaged) properties encode large scale attributes
                  of the signal (its variance). If the rate of changes
                  in the variance and local properties is about the
                  same (like in variance switching), then these two
                  are mixed with each other. Paper derives limits on
                  the physical limits on the speed of adaptation, but
                  I don't follow the derivation.}
}


@Article{gallistel-etal-01,
  journal =	 {Journal of Experimental Psychology: Animal Behavior
                  Processes},
  year =	 2001,
  volume =	 27,
  pages =	 {354--372},
  title =	 {The Rat Approximates an Ideal Detector of Changes in
                  Rates of Reward: Implications for the Law of Effect},
  author =	 {C.~R.~Gallistel and Terence A.~Mark and Adam King
                  and P.~E.~Latham},
  pdf =		 {bio-learning/gallistel_etal_01.pdf},
  abstract =	 {Rats responded on two levers delivering brain
                  stimulation reward on concurrent variable interval
                  schedules. Following many successive sessions with
                  unchanging relative rates of reward, subjects
                  adjusted to an eventual change slowly and showed
                  spontaneous reversions at the beginning of following
                  sessions. When changes in rates of reward occurred
                  between and within every session, subjects adjusted
                  to them about as rapidly as they could in principle
                  do so, as shown by comparison to a Bayesian model of
                  an ideal detector. This and other features of the
                  adjustments to frequent changes imply that the
                  behavioral effect of reinforcement depends on the
                  subject's perception of incomes and changes in
                  incomes rather than on the strengthening and
                  weakening of behaviors in accord with their past
                  effects or expected results. Models for the process
                  by which perceived incomes determine stay durations
                  and for the process that detects changes in rates
                  are developed.},
  entered_on =	 {04/07/2003, 03/10/2004},
  comments =	 {The paper deals with how rats are capable of feeling
                  changes in the statistics of probability
                  distribution of rewards and adapting to those
                  changes (the rewards are essentially Poisson, and
                  the rates change at some point). It has to be noted
                  that the paper only deals with rats' reaction to
                  changes; however, as Gallistel himself noted to me
                  it is unclear that the rat, even after feeling the
                  change in the rates, will quickly adapt its behavior
                  -- it may have its own reason for doing or not doing
                  so. In this sense the results of the paper are only
                  bounds on the real learning happening in
                  rats. Experiments with more pressing rewards (pain
                  maybe?) should be done to remove the lag (if any)
                  between learning about the change and acting upon
                  it. I find the following points in rats behavior
                  very interesting. (1) For random rewards, rats
                  apparently adopt a strategy of random visits to
                  different reward sites [IMHO, this is not well
                  proven in the paper: while stay times at a reward
                  site do look random, I don't see an analysis showing
                  that the binary sequence of visits (to one site, or
                  to the other) is also random]. (2) After a very long
                  period of constant rewards rate, it takes very long
                  time for a rat to adjust to rate changes; the
                  adjustments are very slow. If the change is large,
                  adjustment happens slightly faster. This smells like
                  a hysteresis effect (it's always been this way; if
                  it changes, it's only a fluke). (3) If a rate has
                  long been constant and then changed, and the rat has
                  started to adjust, and then the experiment is
                  stopped and restarted after a while, then the rat
                  reverts to the pre--change response (again, treating
                  the changes as a fluke). (4) If changes are many and
                  fast, the behavior adjustments are rapid (with very
                  short delay), almost discontinuous, and almost ideal
                  (though I don't buy the analysis completely -- see
                  below). (5) Transition between long adjustments and
                  step adjustments take only a handfull of
                  changes. (6) Some (but not very convincing) analysis
                  shows that the rat does not base its rates estimates
                  on very few events preceeding the decision;
                  filtering has long time scale to avoid statistical
                  fluctuations. (7) The rat may make (small)
                  spontaneous changes in its behavior and overadjust
                  for happening changes. (8) The paper argues that
                  expected changes in returns due to rat's adjustment
                  are to small to matter; the rat must be optimizing
                  something else. My biggest problems with the paper
                  are with its appendix -- Bayesian analysis of the
                  events. The model involves two constant rates and a
                  change in between. Similalry, the analysis of the
                  rats' response assumes that the rat changes its
                  behavior at some point rather abruptly. Why should a
                  rat have such a discrete model in its head?
                  Shouldn't it be doing continous filtering and
                  continuous adaptation of its response? In many
                  respects the experimental findings of the paper
                  resonate well with Knudsen's famous experiments of
                  prismatic glasses and owls. Though there is an
                  important difference: Fig. 4 of the paper says that
                  "the amount of prior experience with changes in
                  rates of reward does not determine how rapidly a
                  subject completes its adjustment to a change in the
                  relative rates of reward. What matters is the
                  frequency with which such changes have been
                  encountered recently. When changes have been
                  infrequent, the subject takes a long time to
                  complete its adjustment". In contrasts, old owl
                  which had experiences with glasses when young,
                  adapts to the new glasses in Knudsen's
                  experiments. Also, in the spirit of my work
                  \cite{nemenman-04}, maybe it's not only the
                  frequency of changes that matters, but also the
                  amplitude?}
}


@misc{chen-etal-04,
note = {arXiv: q-bio/0402021},
title= {An Exact Model of Fluctuations in Gene Expression},
author = {W Chen and J England and E Shakhnovich},
abstract ={Fluctuations in the measured mRNA levels of unperturbed
                  cells under fixed conditions have often been viewed
                  as an impediment to the extraction of information
                  from expression profiles. Here, we argue that 
                  such expression fluctuations should themselves be
                  studied as a source of valuable information about
                  the underlying dynamics of genetic networks. By
                  analyzin gmicroarray data taken from Saccharomyces
                  cerevisiae, we demonstrate that correlations in
                  expression fluctuations have a highly statistically
                  significant dependence on gene function, and
                  furthermore exhibit a remarkable scale-free network
                  structure. We therefore present what we view to be
                  the simplest phenomenological model of a genetic
                  network which can account for the presence of
                  biological information in transcript level
                  fluctuations. We proceed to exactly solve this model
                  using a path integral technique and derive several
                  quantitative predictions. Finally, we propose
                  several experiments by which these predictions might
                  be rigorously tested.},
year = 2004, 
pdf = {bionets/chen-etal-04.pdf},
comments = {The authors start with a clearly incorrect premise that
                  the noise observed in mRNA expression data is
                  largely due to the intrinsic stochastic
                  fluctuations. This is wrong -- even at a current
                  stage (2-3 years after the paper was written), the
                  largest source of noise is the population
                  variability and the hybridization variability, not
                  the intrinsic stochasticity. Note that in our ARACNE
                  work, the noise was due to the fluctuations of
                  kinetic rates, not due to the intrinsic fluctuations
                  -- thus we modeled cell-to-cell variability this
                  way. The current paper further
                  develops the work of Collins and Gardner on the
                  linear response of genesto small deviations from the
                  steady state by adding some noise to these linear
                  response equations. But they use unform noise?! And
                  uncorrelated among genes?! Further, eq 16 is clearly
                  true only for constant, uncorrelated noise -- the
                  relationship would  be much more complicated
                  otherwise. Once one allows the noise covariance
                  matrix to have >>1 independent elements, the
                  relation of the linear response coefficents to the
                  observed response fluctuations becomes dependent on
                  this entire multi-parameter unknown noise covariance
                  matrix, so that the relation is almost useless for
                  inference tasks. It's also surprizing that, given
                  that the authors here do just the FDT analysis for a
                  very specific noise spectrum, they never actually
                  mention FDT explicitely. The discussion after Eq 17
                  is interesting though, suggesting that if the
                  network of interactions is clustered, than the
                  network might be inferrable from the fluctuations.},
entered_on = {07/14/06},

}
@article{chen-wang-06,
title = {On the attenuation and amplification of molecular noise in
                  genetic regulatory 
                  networks},
journal = {BMC Bioinformatics},
year =  2006,
volume = 7,
pages = {52},
author = {B-S Chen and Y-C Wang},
abstract ={Background: Noise has many important roles in cellular
                  genetic regulatory functions at the nanomolar
                  scale. At present, no good theory exists for
                  identifying all possible mechanisms of genetic
                  regulatory networks to attenuate the molecular noise
                  to achieve regulatory ability or to amplify the
                  molecular noise to randomize outcomes to the
                  advantage of diversity. Therefore, the noise
                  filtering of genetic regulatory network is an
                  important topic for gene networks under intrinsic
                  fluctuation and extrinsic noise. Results: Based on
                  stochastic dynamic regulation equation, the
                  intrinsic fluctuation in reaction rates is modeled
                  as a state-dependent stochastic process, which will
                  influence the stability of gene regulatory network,
                  especially, with low concentrations of reacting
                  species. Then the mechanisms of genetic regulatory
                  network to attenuate or amplify extrinsic
                  fluctuation are revealed from the nonlinear
                  stochastic filtering point of view. Furthermore, a
                  simple measure of attenuation level or amplification
                  level of extrinsic noise for genetic regulatory
                  networks is also introduced by nonlinear robust
                  filtering method. Based on the global linearization
                  scheme, a convenient method is introduced to measure
                  noise attenuation or amplification for each gene of
                  the nonlinear stochastic regulatory network by
                  solving a set of filtering problems, which
                  correspond to a set of linearized stochastic
                  regulatory networks. Finally, by the proposed
                  methods, several simulation examples of genetic
                  regulatory networks are given to measure their
                  robust stability under intrinsic fluctuations, and
                  to estimate the genesâ€™ attenuation and amplification
                  levels under extrinsic noises. Conclusions: In this
                  study, a stochastic nonlinear dynamic model is
                  developed for genetic regulatory networks under
                  intrinsic fluctuation and extrinsic noise. By the
                  method we proposed, we could determine the robust
                  stability under intrinsic fluctuations and identify
                  the genes that are significantly affected by
                  extrinsic noises, which we call the weak structure
                  of the network. This method will be potential for
                  robust gene circuit design in future, on which a
                  drug design could be based.}, 
pdf = {bionets/cheng-wang-06.pdf},
comments ={},
}

@article{klemm-bronholdt-05,
  journal =	 {Proc. Natl. Acad. Sci},
year = 2005,
volume =102,
pages = {18414-18419},
title= {Topology of biological networks and reliability of information processing},
abstract = {Survival of living cells and organisms is largely based on
                  highly reliable function of their regulatory
                  networks. However, the elements of biological
                  networks, e.g., regulatory genes in genetic networks
                  or neurons in the nervous system, are far from being
                  reliable dynamical elements. How can networks of
                  unreliable elements perform reliably? We here
                  address this question in networks of autonomous
                  noisy elements with fluctuating timing and study the
                  conditions for an overall system behavior being
                  reproducible in the presence of such noise. We find
                  a clear distinction between reliable and unreliable
                  dynamical attractors. In the reliable case,
                  synchrony is sustained in the network, whereas in
                  the unreliable scenario, fluctuating timing of
                  single elements can gradually desynchronize the
                  system, leading to nonreproducible behavior. The
                  likelihood of reliable dynamical attractors strongly
                  depends on the underlying topology of a
                  network. Comparing with the observed architectures
                  of gene regulation networks, we find that those
                  3-node subgraphs that allow for reliable dynamics
                  are also those that are more abundant in nature,
                  suggesting that specific topologies of regulatory
                  networks may provide a selective advantage in
                  evolution through their resistance against noise.},
pdf ={bionets/klemm-bronholdt-05.pdf},
comments = {The opening statements about reliability of neuronal
                  firing and protein concentrations are questionable,
                  just as many papers that they refer to. The paper is
                  not about information processing--there are never
                  any signals. Hill time, noiseless coupling is
                  considered, but with delay. (Why? either we have
                  well-mixed system, and then there's no delay, or the
                  system is inhomogeneous, and then PDEs are
                  needed. Or do they mean delay due to, say,
                  translation?) Noise is introduced by varying the
                  time delay (weird). Even stranger is the choice of
                  fluctuations: interchanging jumps up and down in the
                  delay with varying amplitudes. Some topologies
                  (i.e., feed forward) show a very small sensitivity
                  to varying delays (but, again, this has little to do
                  with signal processing). Feedback loops loos stable
                  dynamics as a function of fluctuations. },
author = {K Klemm and S Bornholdt},
entered_on = {05/18/06},
}
@article{eldar-etal-03,
  journal =	 {Developmental Cell},
  volume =	 5,
  pages =	 {635â€“646},
  year =	 2003,
  title =	 {Self-Enhanced Ligand Degradation Underlies
                  Robustness of Morphogen Gradients},
  author =	 {A Eldar and D Rosin and B-Z Shilo and N Barkai},
  pdf =		 {bionets/eldar-etal-03.pdf},
  abstract =	 {Morphogen gradients provide long-range positional
                  information by extending across a developing
                  field. To ensure reproducible patterning, their
                  profile is invariable despite genetic or
                  environmental fluctuations. Common models assume a
                  morphogen profile that decays exponentially. Here,
                  we show that exponential profiles cannot, at the
                  same time, buffer fluctuations in morphogen
                  production rate and define long-range gradients. To
                  comply with both requirements, morphogens should
                  decay rapidly close to their source but at a
                  significantly slower rate over most of the
                  field. Numerical search revealed two network designs
                  that support robustness to fluctuations in morphogen
                  production rate. In both cases, morphogens enhance
                  their own degradation, leading to a higher
                  degradation rate close to their source. This is
                  achieved through reciprocal interactions between the
                  morphogen and its receptor. The two robust networks
                  are consistent with properties of the Wg and Hh
                  morphogens in the Drosophila wing disc and provide
                  novel insights into their function.},
  comments =	 {The theory to achieve power law morphogen decays is
                  pretty simple. The screen for robust models,
                  however, is limited only to models that form a
                  superset of two well-kno