@misc{schapire-01,
  howpublished = {MSRI Workshop on Nonlinear Estimation and
                  Classification},
  year =	 2002,
  title =	 {The Boosting Approach to Machine Learning: An
                  Overview},
  author =	 {Robert Schapire},
  abstract =	 {Boosting is a general method for improving the
                  accuracy of any given learning algorithm. Focusing
                  primarily on the AdaBoost algorithm, this chapter
                  overviews some of the recent work on boosting
                  including analyses of AdaBoost's training error and
                  generalization error; boosting's connection to
                  game theory and linear programming; the relationship
                  between boosting and logistic regression; extensions
                  of AdaBoost for multiclass classification problems;
                  methods of incorporating human knowledge into
                  boosting; and experimental and applied work using
                  boosting.},
  entered_on =	 {03/17/04},
  pdf =		 {algorithms/schapire-01.pdf},
  comments =	 {A good review of AdaBoost.},
}


@article{tibshirani-96,
  title =	 {Regression Shrinkage and Selection via the {Lasso}},
  author =	 {Robert Tibshirani},
  journal =	 {J.\ Roy.\ Stat.\ Soc., Ser.\ B (Methodol.)},
  volume =	 {58},
  number =	 1,
  year =	 1996,
  pages =	 {267--288},
  entered_on =	 {02/15/04},
  pdf =		 {algorithms/tibshirani-96.pdf},
  abstract =	 {We propose a new method for estimation in linear
                  models. The 'lasso' minimizes the residual sum of
                  squares subject to the sum of the absolute value of
                  the coefficients being less than a constant. Because
                  of the nature of this constraint it tends to
                  produce some coefficients that are exactly 0 and
                  hence gives interpretable models. Our simulation
                  studies suggest that the lasso enjoys some of the
                  favourable properties of both subset selection and
                  ridge regression. It produces interpretable models
                  like subset selection and exhibits the stability of
                  ridge regression. There is also an interesting
                  relationship with the recent work in adaptive
                  function estimation by Donoho and Johnstone. The
                  lasso idea is quite general and can be applied in a
                  variety of statistical models: extensions to
                  generalized regression models and tree-based models
                  are briefly described.},
  comments =	 {A conceptually clear and well--written paper. The
                  usual idea that one has (a-la Bayesian model
                  selection, or James--Stein shrinkage, or Donoho and
                  Johnstone, 1994) is to shrink some coefficients in,
                  say, simple linear models. This is a principled way
                  to proceed, since "shrinkage" of the coefficients
                  results from averaging over the Bayesian
                  prior. Instead, in lasso one starts with requiring
                  that the sum of the absolute values of coefficients
                  in linear models is minimal (this is, by the way,
                  very similar to the standard SVM toolbox). In this
                  case many coefficients that would have usually been
                  "small" are shrunk exactly to zero. The minimization
                  problem for lasso has, therefore, the usual two-term
                  structure: goodness of fit (usual quadratic error),
                  and the complexity penalty (the sum of absolute
                  values of coefficients; other penalties also
                  considered as examples). Errors of the lasso
                  estimates are to be estimated by bootstrap. I think
                  that the absval constraint on parameters is not
                  strong enough to be a good "complexity penalty" in
                  an arbitrary case, when the number of predictor
                  variables is huge (possibly infinite). Fig. 1 in
                  this paper gives a good insight: for very large
                  dimensions, the probability that the ellipses of
                  maximum likelihood will hit the constraints
                  hypercube at points where many coefficients are zero
                  (vertices, ridges) becomes very small. Lasso will
                  eliminate many coefficients for
                  infinite--dimensional models, but it won't eliminate
                  most of them. This is in contrast to Bayesian case
                  \cite{nemenman-04}. It might be interesting to do
                  infinite dimensional analysis of lasso to analyze
                  this question. Note also that in this model one must
                  select the comparative weight of the goodness of fit
                  and the complexity term ("t" in the article) by some
                  other means (e.g., bootstrap): lasso is not a
                  self-sufficient machine.},
}

@article{efron-etal-03,
  title =	 {Least Angle Regression},
  journal =	 {Ann.\ Stat.},
  volume =	 32,
  number =	 2,
  pages =	 {407--499},
  year =	 2004,
  entered_on =	 {02/15/04},
  author =	 {Bradley Efron and Trevor Hastie and Iain Johnstone
                  and Robert Tibshirani},
  abstract =	 {The purpose of model selection algorithms such as
                  All Subsets, Forward Selection, and Backward
                  Elimination is to choose a linear model on the basis
                  of the same set of data to which the model will be
                  applied. Typically we have available a large
                  collection of possible covariates from which we hope
                  to select a parsimonious set for the efficient
                  prediction of a response variable. Least Angle
                  Regression ("LARS"), a new model selection
                  algorithm, is a useful and less greedy version of
                  traditional forward selection methods. Three main
                  properties are derived. (1) A simple modification of
                  the LARS algorithm implements the Lasso, an
                  attractive version of Ordinary Least Squares that
                  constrains the sum of the absolute regression
                  coefficients; the LARS modification calculates all
                  possible Lasso estimates for a given problem, using
                  an order of magnitude less computer time than
                  previous methods. (2) A different LARS modification
                  efficiently implements Forward Stagewise linear
                  regression, another promising new model selection
                  method; this connection explains the similar
                  numerical results previously observed for the Lasso
                  and Stagewise, and helps understand the properties
                  of both methods, which are seen as constrained
                  versions of the simpler LARS algorithm. (3) A simple
                  approximation for the degrees of freedom of a LARS
                  estimate is available, from which we derive a Cp
                  estimate of prediction error; this allows a
                  principled choice among the range of possible LARS
                  estimates. LARS and its variants are computationally
                  efficient: the paper describes a publicly available
                  algorithm that requires only the same order of
                  magnitude of computational effort as Ordinary Least
                  Squares applied to the full set of covariates.},
  pdf =		 {algorithms/efron-etal-03.pdf},
  comments =	 {The LARS algorithm is an "algorithm" -- a sequence
                  of steps that one has to undertake. It is not clear
                  which problem is being solved, which Lagrangian is
                  being minimized. Thus it is very difficult (for me)
                  to read this paper. A few points: (1) the algorithm
                  has connections to lasso, boosting, and forward
                  stagewise regression (See Fig. 5). The performance
                  of these methods is indistinguishable from each
                  other within statistical errors. So, basically,
                  these are all algorithms solving almost the same
                  problem in various algorithmic ways; Lasso has the
                  best explanation of what is really being done -- it
                  has a Lagrangian which is being minimized. Also
                  looking at Fig. 5 we see that (2) neither of these
                  methods (LARS, Lasso, stagewise) is capable of doing
                  proper model capacity control -- once the number of
                  parameters goes above the best, the models start to
                  have large prediction errors. So, together with the
                  algorithms, one still must use some external method
                  (bootstrap? crossvalidation?) for capacity
                  control. (3) Further, we see that, within
                  statistical error, the best possible performance of
                  a simple, unconstrained, regression is as good as of
                  all of these new methods; the best performance for
                  the OLS is actually reached with a lot smaller
                  number of predictors! This performance decays very
                  rapidly as more terms are introduced, so one must
                  have a good capacity control mechanism built in --
                  but this is required for LARS and Co. anyway. Then
                  the question is: if, at their best, LARS and
                  derivatives are comparable to OLS, and all require
                  capacity control tools, why shouldn't we go with the
                  easier OLS? In any case, it has a great Bayesian
                  model selection going for it!}
}

@article{dempster-etal-77,
  author =	 {Dempster, A. P. and Laird, N. M. and Rubin, D. B.},
  journal =	 {J.\ Roy.\ Stat.\ Soc., Ser.\ B (Methodol.)},
  title =	 {Maximum likelihood from incomplete data via the {EM}
                  algorithm},
  volume =	 39,
  number =	 1,
  year =	 1977,
  pages =	 {1--38},
  entered_on =	 {04/08/03},
  url =		 {algorithms/dempster-laird-rubin-77.djv},
  abstract =	 {A broadly applicable algorithm for computing maximum
                  likelihood estimates from incomplete data is
                  presented at various levels of generality. Theory
                  showing the monotone behaviour of the likelihood and
                  the convergence of the algorithm is derived. Many
                  examples are sketched, including missing value
                  situations, applications to grouped, censored or
                  truncated data, finite mixture models, variance
                  component estimation, hyperparameter estimation,
                  iteratively reweighted least squares and factor
                  analysis.},
  comments =	 {This paper introduces the EM algorithm. One of the
                  most notable examples it gives is about
                  hyperparameter estimation, which is roughly
                  equivalent to Bayesian model selection in today's
                  language.},
}


@article{hasselblad-69,
  author =	 {Victor Hasselblad},
  title =	 {Estimation of finite mixtures of distributions from
                  the exponential family},
  journal =	 {J.\ Amer.\ Stat.\ Assoc.},
  year =	 {1969},
  volume =	 {64},
  number =	 328,
  month =	 dec,
  pages =	 {1459--1471},
  url =		 {algorithms/hasselblad-69.djv},
  entered_on =	 {04/09/03},
  abstract =	 {General ``successive substitutions'' iteration
                  equations are developed for obtaining estimates for
                  finite mixtures of distributions from the exponential
                  family. These, in general, correspond to relative
                  maximums of the likelihood function. It is assumed
                  that the number of distributions is known, and that
                  the mixtures are from distributions of the same
                  type, but with different parameter values. The
                  particular equations for the Poisson, binomial, and
                  exponential distributions are given, as well as
                  examples of the results of the procedure for each
                  distribution. From the examples tried, it was
                  observed that the likelihood function increased at
                  each iteration. Graphs of the asymptotic variances
                  of the estimates are given, and two sampling
                  experiments comparing estimates obtained by this
                  scheme with moment estimates are also given.},
  comments =	 {The method discussed in this paper is a precursor of
                  the EM algorithm, \cite{dempster-etal-77}. EM is
                  used to estimate the mixture parameters in this
                  paper.},
}

@article{kurdistani-etal-02,
  title =	 {Genome-wide binding map of the histone deacetylase
                  {Rpd3} in yeast},
  author =	 {S Kurdistani and D Robyr and S Tavazoie and M
                  Grunstein},
  abstract =	 {We describe the genome-wide distribution of the
                  histone deacetylase and repressor Rpd3 and its
                  associated proteins Ume1 and Ume6 in Saccharomyces
                  cerevisiae. Using a new cross-linking protocol, we
                  found that Rpd3 binds upstream of many individual
                  genes and upstream of members of gene classes with
                  similar functions in anabolic processes. In
                  addition, Rpd3 is preferentially associated with
                  promoters that direct high transcriptional
                  activity. We also found that Rpd3 was absent from
                  large sub-telomeric domains. We show by
                  co-immunoprecipitation and by the high similarity of
                  their binding maps that Ume1 interacts with Rpd3. In
                  contrast, despite the known role of Ume6 in Rpd3
                  recruitment, only a limited number of the genes
                  targeted by Rpd3 are also enriched for (or targeted
                  by) Ume6. This suggests that Rpd3 is brought to many
                  promoters by alternative recruiters, some of which
                  may bind the putative cis-regulatory DNA elements
                  that we have identified in sets of Rpd3 target
                  genes. Finally, we show that comparing the
                  genome-wide pattern of Rpd3 binding with gene
                  expression and histone acetylation in the
                  rpd3 mutant strain reveals new sites of Rpd3
                  function.},
  journal =	 {Nat. Genet.},
  volume =	 31,
  year =	 2002,
  pages =	 {248--254},
  pdf =		 {bioinformatics/kurdistani-etal-02.pdf},
  entered_on =	 {08/05/05},
  comments =	 {},
}


@article{ronen-etal-02,
  title =	 {Assigning numbers to the arrows: Parameterizing a
                  gene regulation network by using accurate expression
                  kinetics},
  author =	 {M Ronen and R Rosenberg and B Shraiman and U Alon},
  abstract =	 {A basic challenge in systems biology is to
                  understand the dynamical behavior of gene regulation
                  networks. Current approaches aim at determining the
                  network structure based on genomic-scale
                  data. However, the network connectivity alone is not
                  sufficient to define its dynamics; one needs to also
                  specify the kinetic parameters for the regulation
                  reactions. Here, we ask whether effective kinetic
                  parameters can be assigned to a transcriptional
                  network based on expression data. We present a
                  combined experimental and theoretical approach based
                  on accurate high temporal-resolution measurement of
                  promoter activities from living cells by using green
                  fluorescent protein (GFP) reporter plasmids. We
                  present algorithms that use these data to assign
                  effective kinetic parameters within a mathematical
                  model of the network. To demonstrate this, we employ
                  a well defined network, the SOS DNA repair system of
                  Escherichia coli. We find a strikingly detailed
                  temporal program of expression that correlates with
                  the functional role of the SOS genes and is driven
                  by a hierarchy of effective kinetic parameter
                  strengths for the various promoters. The calculated
                  parameters can be used to determine the kinetics of
                  all SOS genes given the expression profile of just
                  one representative, allowing a significant reduction
                  in complexity. The concentration profile of the
                  master SOS transcriptional repressor can be
                  calculated, demonstrating that relative protein
                  levels may be determined from purely transcriptional
                  data. This finding opens the possibility of
                  assigning kinetic parameters to transcriptional
                  networks on a genomic scale.},
  journal =	 {Proc. Natl. Acad. Sci. USA},
  year =	 2002,
  volume =	 99,
  number =	 16,
  pages =	 {10555--10560},
  entered_on =	 {08/04/05},
  comments =	 {The authors measure expression of RNAs of single
                  genes in the SOS module to high precision and then
                  fit the measurements to Michaelis-Menten kinetics
                  for both the expression and the involved protein
                  level. This way they find out all kinetic constants
                  (but, recall, they start from the known topology of
                  the network). Nothing extraordinary here.},
  pdf =		 {bioinformatics/ronen-etal-02.pdf},
}
@article{mcadams-arkin-98m,
  title =	 {Simulation of Prokaryotic Genetic Circuits},
  author =	 {H McAdams and A Arkin},
  abstract =	 {Biochemical and genetic approaches have identified
                  the molecular mechanisms of many genetic reactions,
                  particularly in bacteria. Now a comparably detailed
                  understanding is needed of how groupings of genes
                  and related protein reactions interact to
                  orchestrate cellular functions over the cell cycle,
                  to implement preprogrammed cellular development, or
                  to dynamically change a cell's processes and
                  structures in response to environmental
                  signals. Simulations using realistic,
                  molecular-level models of genetic mechanisms and of
                  signal transduction networks are needed to analyze
                  dynamic behavior of multigene systems, to predict
                  behavior of mutant circuits, and to identify the
                  design principles applicable to design of genetic
                  regulatory circuits. When the underlying design
                  rules for regulatory circuits are understood, it
                  will be far easier to recognize common circuit
                  motifs, to identify functions of individual proteins
                  in regulation, and to redesign circuits for altered
                  functions.},
  journal =	 {Annu. Rev. Biophys. Biomol. Struct.},
  year =	 1998,
  volume =	 27,
  pages =	 {199--224},
  comments =	 {Good introductory review of things from
                  biochemistry, noise, to chemotaxis, lambda-phage
                  switch, cell cycle. Good literature collection circa
                  1988.},
  pdf =		 {bioinformatics/mcadams_arkin_98.pdf},
  entered_on =	 {08/01/05},
}

@article{yeung-02,
  title =	 {Reverse engineering of gene networks using singular
                  value decomposition and robust regression},
  author =	 {M. K. S. Yeung and J. Tegn{\'e}r and J. J. Collins},
  abstract =	 {We propose a scheme to reverse-engineer gene
                  networks on a genome-wide scale using a relatively
                  small amount of gene expression data from microarray
                  experiments. Our method is based on the empirical
                  observation that such networks are typically large
                  and sparse. It uses singular value decomposition to
                  construct a family of candidate solutions and then
                  uses robust regression to identify the solution with
                  the smallest number of connections as the most
                  likely solution. Our algorithm has $O(\log N)$
                  sampling complexity and $O(N^4)$ computational
                  complexity. We test and validate our approach in a
                  series of in numero experiments on model gene
                  networks.},
  journal =	 {Proc. Natl. Acad. Sci. USA},
  volume =	 99,
  year =	 2002,
  number =	 9,
  pages =	 {6163--6168},
  pdf =		 {bioinformatics/yeung-tegner-collins-02.pdf},
  entered_on =	 {08/02/05},
  comments =	 {See also \cite{alter-etal-00}. This is relevant to
                  \cite{wiggins-nemenman-03} -- same linearization of
                  the dynamics near the steady state. They use L1
                  regression (Lasso) as a complexity control
                  mechanism. This is questionable in higher dimensions
                  (see my comments on Lasso), since this complexity
                  control measure leads to a fraction of all possible
                  parameters to be nonzero, not a low power of the
                  number of the parameters. The key point of the paper
                  is the linearization of the dynamics near the steady
                  state after a small transient perturbation. Nicely,
                  one can reconstruct networks this way element by
                  element, focusing on a network around a single gene
                  without analyzing the whole networks. In many
                  respects, ARACNE is the generalization of this
                  algorithm without the linearity assumption and with
                  stricter capacity control mechanisms.},
}

@article{li-97,
  author =	 {W Li},
  title =	 {The Complexity of {DNA}: {The} measure of
                  compositional heterogeneity in {DNA} sequences and
                  measures of complexity},
  journal =	 {Complexity},
  volume =	 3,
  number =	 2,
  pages =	 {33--37},
  entered_on =	 {08/01/05},
  pdf =		 {bioinformatics/li_97.pdf},
  comments =	 {Long range correlation in DNA. Potentially useful
                  for \cite{bnt-01} since it defines some arbitrary
                  complexity measure.},
  year =	 1997,
}

@article{li-etal-94,
  author =	 {W Li and T Marr and K Kaneko},
  year =	 1994,
  title =	 {Understanding long-range correlations in {DNA}
                  sequences},
  journal =	 {Physica D (Nonlinearity)},
  volume =	 75,
  pages =	 {392--416},
  pdf =		 {bioinformatics/li_marr_koneko_94.pdf},
  note =	 {erratum: Physica D, 82:217 (1995)},
  entered_on =	 {08/01/05},
  comments =	 {Long range correlations and 1/f spectra in genomic
                  data.},
  abstract =	 {In this paper, we review the literature on
                  statistical long-range correlation in DNA
                  sequences. We examine the current evidence for these
                  correlations, and conclude that a mixture of many
                  length scales (including some relatively long ones)
                  in DNA sequences is responsible for the observed
                  1/f-like spectral component. We note the complexity
                  of the correlation structure in DNA sequences. The
                  observed complexity often makes it hard, or
                  impossible, to decompose the sequence into a few
                  statistically stationary regions. We suggest that,
                  based on the complexity of DNA sequences, a fruitful
                  approach to understand long-range correlation is to
                  model duplication, and other rearrangement
                  processes, in DNA sequences. One model, called
                  "expansion-modification system", contains only point
                  duplication and point mutation. Though simplistic,
                  this model is able to generate sequences with 1/f
                  spectra. We emphasize the importance of DNA
                  duplication in its contribution to the observed
                  long-range correlation in DNA sequences.},
}
@article{alter-etal-00,
  title =	 {Singular value decomposition for genome-wide
                  expression data processing and modeling},
  author =	 {O. Alter and P. O. Brown and D. Botstein},
  abstract =	 {We describe the use of singular value decomposition
                  in transforming genome-wide expression data from
                  genes $\times$ arrays space to reduced diagonalized
                  eigengenes $\times$ eigenarrays space, where the
                  eigengenes (or eigenarrays) are unique orthonormal
                  superpositions of the genes (or arrays). Normalizing
                  the data by filtering out the eigengenes (and
                  eigenarrays) that are inferred to represent noise or
                  experimental artifacts enables meaningful comparison
                  of the expression of different genes across
                  different arrays in different experiments. Sorting
                  the data according to the eigengenes and eigenarrays
                  gives a global picture of the dynamics of gene
                  expression, in which individual genes and arrays
                  appear to be classified into groups of similar
                  regulation and function, or similar cellular state
                  and biological phenotype, respectively. After
                  normalization and sorting, the significant
                  eigengenes and eigenarrays can be associated with
                  observed genome-wide effects of regulators, or with
                  measured samples, in which these regulators are
                  overactive or underactive, respectively.},
  entered_on =	 {08/01/05},
  journal =	 {Proc. Natl. Acad. Sci. USA},
  year =	 2000,
  volume =	 97,
  number =	 18,
  pages =	 {10101--10106},
  pdf =		 {bioinformatics/alter-etal-00.pdf},
  comments =	 {One of the first microarray SVD papers; a few
                  datasets are analyzed, it's shown that few
                  components capture most of the variance.},
}


@article{nowicka-etal-01,
  note =	 {arXiv: cond-mat/0102348},
  title =	 {Long-Tail Feature of {DNA} Words Over- and
                  Under-Representation in Coding Sequences},
  author =	 {A Nowicka and M Dudek and S Cebrat and M Kowalczuk
                  and P Mackiewicz and M Dudkiewicz and D Szczepanik},
  journal =	 {Comp. Meth. Sci. Technol.},
  volume =	 6,
  pages =	 {65--71},
  year =	 2000,
  abstract =	 {We have analyzed DNA sequences of known genes from
                  16 yeast chromosomes (Saccharomyces cerevisiae) in
                  terms of oligonucleotides. We have noticed that the
                  relative abundances of oligonucleotide usage in the
                  genome follow a long-tail Levy-like distribution. We
                  have observed that long genes often use strongly
                  over-represented and under-represented nucleotides,
                  whereas it was not the case for the short genes
                  (shorter than 300 nucleotides) under
                  consideration. If selection on the extremely
                  over-represented/under-represented oligonucleotides
                  was strong, long genes would be more affected by
                  spontaneous mutations than short ones.},
  comments =	 {Good quick review of statistics of nucleotide
                  frequencies. Levy flights, Zipf law in nucleotide
                  frequencies.},
  entered_on =	 {08/01/05},
  pdf =		 {bioinformatics/nowicka-etal-01.pdf},
}

@misc{chattopadhyay-etal-01,
  howpublished = {arXiv: physics/0102043},
  year =	 2001,
  title =	 {Statistical Approach to Gene Evolution},
  abstract =	 {The evolution in coding DNA sequences brings new
                  flexibility and freedom to the codon words, even as
                  the underlying nucleotides get significantly
                  ordered. These curious contra-rules of gene
                  organisation are observed from the distribution of
                  words and the second moments of the nucleotide
                  letters. These statistical data give us the physics
                  behind the classification of bacteria.},
  author =	 {S Chattopadhyay and W Kanner and J Chakrabarti},
  entered_on =	 {08/01/05},
  comments =	 {Zipf law, etc., in codon distribution.}
}

@article{holter-etal-01,
  title =	 {Dynamic modeling of gene expression data},
  author =	 {N Holter and A Maritan and M Cieplak and N Fedoroff
                  and J Banavar},
  abstract =	 {We describe the time evolution of gene expression
                  levels by using a time translational matrix to
                  predict future expression levels of genes based on
                  their expression levels at some initial time. We
                  deduce the time translational matrix for previously
                  published DNA microarray gene expression data sets
                  by modeling them within a linear framework by using
                  the characteristic modes obtained by singular value
                  decomposition. The resulting time translation matrix
                  provides a measure of the relationships among the
                  modes and governs their time evolution. We show that
                  a truncated matrix linking just a few modes is a
                  good approximation of the full time translation
                  matrix. This finding suggests that the number of
                  essential connections among the genes is small.},
  journal =	 {Proc. Natl. Acad. Sci. USA},
  year =	 2001,
  volume =	 98,
  number =	 4,
  pages =	 {1693--1698},
  pdf =		 {bioinformatics/holter-etal-01.pdf},
  entered_on =	 {08/01/05},
  comments =	 {Pretty similar to \cite{wiggins-nemenman-03}. This
                  paper doesn't much deal with overfitting, and I
                  think they overfit.},
}


@article{brown-etal-99,
  title =	 {Knowledge-based analysis of microarray gene
                  expression data by using support vector machines},
  author =	 {M Brown and W N Grundy and D Lin and N Cristianini
                  and C W Sugnet and T Furey and M Ares and D
                  Haussler},
  abstract =	 {We introduce a method of functionally classifying
                  genes by using gene expression data from DNA
                  microarray hybridization experiments. The method is
                  based on the theory of support vector machines
                  (SVMs). SVMs are considered a supervised computer
                  learning method because they exploit prior knowledge
                  of gene function to identify unknown genes of
                  similar function from expression data. SVMs avoid
                  several problems associated with unsupervised
                  clustering methods, such as hierarchical clustering
                  and self-organizing maps. SVMs have many
                  mathematical features that make them attractive for
                  gene expression analysis, including their
                  flexibility in choosing a similarity function,
                  sparseness of solution when dealing with large data
                  sets, the ability to handle large feature spaces,
                  and the ability to identify outliers. We test
                  several SVMs that use different similarity metrics,
                  as well as some other supervised learning methods,
                  and find that the SVMs best identify sets of genes
                  with a common function using expression
                  data. Finally, we use SVMs to predict functional
                  roles for uncharacterized yeast ORFs based on their
                  expression data.},
  pages =	 {262--267},
  journal =	 {Proc. Natl. Acad. Sci. USA},
  year =	 2000,
  volume =	 97,
  number =	 1,
  entered_on =	 {08/01/05},
  pdf =		 {bioinformatics/brown-etal-00.pdf},
  comments =	 {Uses SVMs to classify genes by their expressions,
                  where the training set consists of genes of known
                  function (belonging to the same functional class).},
}

@misc{gorban-etal-01,
  howpublished = {arXiv: physics/0108016},
  title =	 {Self-organizing Approach for Automated Gene
                  Identification in Whole Genomes},
  author =	 {A N Gorban and A Zinovyev and T Popova},
  year =	 2001,
  abstract =	 {An approach based on using the idea of distinguished
                  coding phase in explicit form for identification of
                  protein-coding regions (exons) in whole genome has
                  been proposed. For several genomes an optimal window
                  length for averaging GC-content function and
                  calculating codon frequencies has been
                  found. Self-training procedure based on clustering
                  in multidimensional space of triplet frequencies is
                  proposed. For visualization of data in the space of
                  triplet frequencies method of elastic maps was
                  applied.},
  entered_on =	 {08/01/05},
  pdf =		 {bioinformatics/gorban-etal-01.pdf},
  comments =	 {Use GC content and the similarity between one-base
                  shifted frequencies of bases to distinguish coding
                  regions. Isn't this not a problem any more?},
  url =		 {http://arxiv.org/abs/physics/0108016},
}

@article{friedman-etal-00,
  title =	 {Using {Bayesian} Networks to Analyze Expression Data},
  author =	 {N Friedman and M Linial and I Nachman and D Pe'er},
  abstract =	 {DNA hybridization arrays simultaneously measure the
                  expression level for thousands of genes. These
                  measurements provide a snapshot of the cells
                  transcriptions. A major challenge in computational
                  biology is to uncover, from such measurements,
                  gene/protein interactions and key biological
                  features of the cellular system. In this paper, we
                  propose a new framework for discovering interactions
                  between genes based on multiple expression
                  measurements. This framework builds on the use of
                  Bayesian networks for representing statistical
                  dependencies. A Bayesian network is a graphical
                  model of joint multivariate probability
                  distributions that captures properties of
                  conditional independence between variables. Such
                  models are attractive for their ability to describe
                  complex stochastic processes, and for providing
                  clear methodologies for learning from (noisy)
                  observations. We start by showing how Bayesian
                  networks can describe interactions between genes. We
                  then present an efficient algorithm capable of
                  learning such networks and a statistical method to
                  assess our confidence in their features. Finally, we
                  apply this method to the S. cerevisiae cell-cycle
                  measurements of Spellman et al. (1998) to uncover
                  biological features. },
  pdf =		 {bioinformatics/friedman-etal-00.pdf},
  comments =	 {Standard Bayes Nets for expression data; one of the
                  first such papers.},
  journal =	 {J. Comput. Biol.},
  year =	 2000,
  volume =	 7,
  number =	 {3--4},
  pages =	 {601--620},
}

@inproceedings{nachman-etal-05,
  title =	 {Inferring Quantitative Models of Regulatory Networks
                  From Expression Data},
  author =	 {I Nachman and A Regev and N Friedman},
  abstract =	 {Motivation: Genetic networks regulate key processes
                  in living cells. Various methods have been suggested
                  to reconstruct network architecture from gene
                  expression data. However, most approaches are based
                  on qualitative models that provide only rough
                  approximations of the underlying events, and lack
                  the quantitative aspects that are critical for
                  understanding the proper function of biomolecular
                  systems. <p>Results: We present fine-grained
                  dynamical models of gene transcription and develop
                  methods for reconstructing them from gene expression
                  data within the framework of a generative
                  probabilistic model. Unlike previous works, we
                  employ quantitative transcription rates, and
                  simultaneously estimate both the kinetic parameters
                  that govern these rates, and the activity levels of
                  unobserved regulators that control them. We apply
                  our approach to expression data sets from yeast and
                  show that we can learn the unknown regulator
                  activity profiles, as well as the binding affinity
                  parameters. We also introduce a novel structure
                  learning algorithm, and demonstrate its power to
                  accurately reconstruct the regulatory network from
                  those data sets.},
  booktitle =	 {Proc. ISMB'04},
  year =	 2004,
  entered_on =	 {02/02/05},
  comments =	 {Models gene expression as production due to activity
                  of some regulators (activated TFs), and mRNA
                  decay. Algorithm fits for a small number of
                  regulators with some time profiles that can explain
                  the observed mRNA activity (with some noise, of
                  course). No way is suggested for identification of
                  the regulator with a gene that produces it. That is,
                  a reconstructed network has only regulator-gene
                  connections, and expression of genes is not used to
                  model expression of regulators (this should be
                  correctable).},
}

@article{zhou-etal-04,
  title =	 {A {Bayesian} connectivity-based approach to
                  constructing probabilistic gene regulatory networks},
  author =	 {X Zhou and X Wang and R Pal and I Ivanov and M
                  Bittner and E Dougherty},
  journal =	 {Bioinformatics},
  pages =	 {2918--2927},
  volume =	 20,
  number =	 17,
  year =	 2004,
  abstract =	 {Motivation: We have hypothesized that the
                  construction of transcriptional regulatory networks
                  using a method that optimizes connectivity would
                  lead to regulation consistent with biological
                  expectations. A key expectation is that the
                  hypothetical networks should produce a few, very
                  strong attractors, highly similar to the original
                  observations, mimicking biological state stability
                  and determinism. Another central expectation is
                  that, since it is expected that the biological
                  control is distributed and mutually reinforcing,
                  interpretation of the observations should lead to a
                  very small number of connection schemes. <p>Results:
                  We propose a fully Bayesian approach to constructing
                  probabilistic gene regulatory networks (PGRNs) that
                  emphasizes network topology. The method computes the
                  possible parent sets of each gene, the corresponding
                  predictors and the associated probabilities based on
                  a nonlinear perceptron model, using a reversible
                  jump Markov chain Monte Carlo (MCMC) technique, and
                  an MCMC method is employed to search the network
                  configurations to find those with the highest
                  Bayesian scores to construct the PGRN. The Bayesian
                  method has been used to construct a PGRN based on
                  the observed behavior of a set of genes whose
                  expression patterns vary across a set of melanoma
                  samples exhibiting two very different phenotypes
                  with respect to cell motility and invasiveness. Key
                  biological features have been faithfully reflected
                  in the model. Its steady-state distribution contains
                  attractors that are either identical or very similar
                  to the states observed in the data, and many of the
                  attractors are singletons, which mimics the
                  biological propensity to stably occupy a given
                  state. Most interestingly, the connectivity rules
                  for the most optimal generated networks constituting
                  the PGRN are remarkably similar, as would be
                  expected for a network operating on a distributed
                  basis, with strong interactions between the
                  components. <p>Availability: The appendix is
                  available at
                  http://gspsnap.tamu.edu/gspweb/pgrn/bayes.html. username:
                  gspweb password: gsplab.},
  pdf =		 {bioinformatics/zhou-etal-04.pdf},
  entered_on =	 {02/02/05},
  comments =	 {As far as I can tell, the regulation is modeled by
                  boolean networks, and measurements of steady states
                  of transcriptional networks are steady states of
                  these boolean ones. There is a probability of a
                  switch between boolean networks at every evolution
                  step, resulting in many possible steady states, and
                  also some noise around them. Each boolean network is
                  then learned, their number is learned, probability
                  of switches are learned, all in Bayesian formalism
                  with appropriate BIC-like complexity
                  control. Examples analyzed involve networks of only
                  10 (!) genes. It is seen that the connectivity
                  patterns of the constituent Boolean nets are not
                  very different from each other, and this is somehow
                  related to robustness of transcriptional control,
                  homeostasis, etc -- don't ask me how.},
}













@article{brown-callan-04,
  title =	 {Evolutionary comparisons suggest many novel {cAMP}
                  response protein binding sites in {Escherichia} coli},
  author =	 {Brown, C. T. and Callan, Jr., C. G.},
  abstract =	 {The cAMP response protein (CRP) is a transcription
                  factor known to regulate many genes in Escherichia
                  coli. Computational studies of transcription factor
                  binding to DNA are usually based on a simple matrix
                  model of sequence-dependent binding energy. For CRP,
                  this model predicts many binding sites that are not
                  known to be functional. If they are indeed spurious,
                  the underlying binding model is called into
                  question. We use a species comparison method to
                  assess the functionality of a population of such
                  predicted CRP sites in E. coli. We compare them with
                  orthologous sites in Salmonella typhimurium
                  identified independently by CLUSTALW
                  alignment, and find a dependence of mutation
                  probability on position in the site. This dependence
                  increases with predicted site binding energy. The
                  positions where mutation is most strongly suppressed
                  are those where mutation would have the biggest
                  effect on predicted binding energy. This finding
                  suggests that many of the novel sites are
                  functional, that the matrix model correctly
                  estimates their binding strength, and that
                  calculated CRP binding strength is the quantity that
                  is conserved between species. The analysis also
                  identifies many new E. coli binding sites and genes
                  likely to be functional for CRP.},
  pdf =		 {bioinformatics/brown-callan-03.pdf},
  journal =	 {Proc. Natl. Acad. Sci. USA},
  year =	 2004,
  volume =	 101,
  number =	 8,
  pages =	 {2404--2409},
  comments =	 {},
  entered_on =	 {05/09/04},
}
@article{mwangi-siggia-03,
  title =	 {Genome wide identification of regulatory motifs in
                  {Bacillus} subtilis},
  author =	 {Michael Mwangi and Eric Siggia},
  abstract =	 {Background: To explain the vastly different
                  phenotypes exhibited by the same organism under
                  different conditions, it is essential that we
                  understand how the organism's genes are coordinately
                  regulated. While there are many excellent tools for
                  predicting sequences encoding proteins or RNA genes,
                  few algorithms exist to predict regulatory sequences
                  on a genome wide scale with no prior
                  information. <p>Results: To identify motifs involved
                  in the control of transcription, an algorithm was
                  developed that searches upstream of operons for
                  improbably frequent dimers. The algorithm was
                  applied to the B. subtilis genome, which is
                  predicted to encode for approximately 200 DNA
                  binding proteins. The dimers found to be
                  over-represented could be clustered into 317
                  distinct groups, each thought to represent a class
                  of motifs uniquely recognized by some transcription
                  factor. For each cluster of dimers, a representative
                  weight matrix was derived and scored over the
                  regions upstream of the operons to predict the sites
                  recognized by the cluster's factor, and a putative
                  regulon of the operons immediately downstream of the
                  sites was inferred. The distribution in number of
                  operons per predicted regulon is comparable to that
                  for well characterized transcription factors. The
                  most highly over-represented dimers matched
                  $\sigma^A$, the T-box, and $\sigma^W$ sites. We have
                  evidence to suggest that at least 52 of our clusters
                  of dimers represent actual regulatory motifs, based
                  on the groups' weight matrix matches to
                  experimentally characterized sites, the functional
                  similarity of the component operons of the groups'
                  regulons, and the positional biases of the weight
                  matrix matches. All predictions are assigned a
                  significance value, and thresholds are set to avoid
                  false positives. Where possible, we examine our
                  false negatives, drawing examples from known
                  regulatory motifs and regulons inferred from RNA
                  expression data. Conclusions: We have demonstrated
                  that in the case of B. subtilis our algorithm allows
                  for the genome wide identification of regulatory
                  sites. As well as recovering known sites, we predict
                  new sites of yet uncharacterized factors. Results
                  can be viewed at
                  http://www.physics.rockefeller.edu/~mwangi/.},
  journal =	 {BMC Bioinformatics},
  year =	 2003,
  volume =	 4,
  number =	 18,
  pdf =		 {bioinformatics/mwangi-siggia-03.pdf},
  entered_on =	 {05/05/04},
  comments =	 {Good literature collection on searching for
                  overrepresented motifs. Last paragraph page 2 lists
                  detriments of the algorithm. Search for dimers of
                  length 4-5 with separation of 3-30. Significance of
                  dimers is determined by their occurrence relative to
                  independent Poissonian occurrence. Repeat of the same
                  nucleotides are ignored. A particular ad hoc metric
                  for comparing two dimers is introduced, and dimers
                  are then clustered according to this metric using
                  some particular clustering algorithm they
                  invented. Determining the optimal number of clusters
                  -- they should read on work of Still and Bialek. The
                  point of the work is that motifs =
                  overrepresentation. This should not be this way. On
                  the contrary, motifs can be underrepresented (that
                  is, evolution cleaned up all sequences except the
                  real motifs), or anything else. One should use
                  conserved sites over many species, or something
                  similar to see real motifs. Or maybe just any
                  statistical fluctuation, not necessarily
                  overrepresentation.},
}

@article{bussemaker-etal-00,
  title =	 {Building a dictionary for genomes: Identification of
                  presumptive regulatory sites by statistical
                  analysis},
  author =	 {H Bussemaker and H Li and E Siggia},
  abstract =	 {The availability of complete genome sequences and
                  mRNA expression data for all genes creates new
                  opportunities and challenges for identifying DNA
                  sequence motifs that control gene expression. An
                  algorithm, MobyDick, is presented that
                  decomposes a set of DNA sequences into the most
                  probable dictionary of motifs or words. This method
                  is applicable to any set of DNA sequences: for
                  example, all upstream regions in a genome or all
                  genes expressed under certain
                  conditions. Identification of words is based on a
                  probabilistic segmentation model in which the
                  significance of longer words is deduced from the
                  frequency of shorter ones of various lengths,
                  eliminating the need for a separate set of reference
                  data to define probabilities. We have built a
                  dictionary with 1,200 words for the 6,000
                  upstream regulatory regions in the yeast genome; the
                  500 most significant words (some with as few as 10
                  copies in all of the upstream regions) match 114 of
                  443 experimentally determined sites (a significance
                  level of 18 standard deviations). When analyzing all
                  of the genes up-regulated during sporulation as a
                  group, we find many motifs in addition to the few
                  previously identified by analyzing the subclusters
                  individually to the expression subclusters. Applying
                  MobyDick to the genes derepressed when the general
                  repressor Tup1 is deleted, we find known as well as
                  putative binding sites for its regulatory partners.},
  pages =	 {10096--10100},
  journal =	 {Proc. Natl. Acad. Sci. USA},
  year =	 2000,
  volume =	 97,
  number =	 18,
  entered_on =	 {08/01/05},
  pdf =		 {bioinformatics/bussemaker-li-siggia-00.pdf},
}


@article{bussemaker-li-siggia-01,
  author =	 {H. Bussemaker and H. Li and E. Siggia},
  title =	 {Regulatory element detection using correlation with
                  expression},
  journal =	 {Nature Genetics},
  year =	 {2001},
  volume =	 {27},
  pages =	 {167--171},
  entered_on =	 {05/05/04},
  pdf =		 {bioinformatics/bussemaker-li-siggia-01.pdf},
  abstract =	 {We present here a new computational method for
                  discovering cis-regulatory elements that circumvents
                  the need to cluster genes based on their expression
                  profiles. Based on a model in which upstream motifs
                  contribute additively to the log-expression level of
                  a gene, this method requires a single genome-wide
                  set of expression ratios and the upstream sequence
                  for each gene, and outputs statistically significant
                  motifs. Analysis of publicly available expression
                  data for Saccharomyces cerevisiae reveals several new
                  putative regulatory elements, some of which
                  plausibly control the early, transient induction of
                  genes during sporulation. Known motifs generally
                  have high statistical significance.},
  comments =	 {The REDUCE paper. Biggest problem -- additive
                  regulation assumed. Fitting to data is done by
                  linear regression as well.},
}

@misc{eisenberg-levanon-03,
  author =	 {Eli Eisenberg and Erez Levanon},
  title =	 {Human housekeeping genes are compact},
  year =	 2003,
  abstract =	 {We identify a set of 575 human genes that are
                  expressed in all conditions tested in a publicly
                  available database of microarray results. Based on
                  this common occurrence, the set is expected to be
                  rich in "housekeeping" genes, showing constitutive
                  expression in all tissues. We compare selected
                  aspects of their genomic structure with a set of
                  background genes. We find that the introns,
                  untranslated regions and coding sequences of the
                  housekeeping genes are shorter, indicating a
                  selection for compactness in these genes. },
  url =		 {http://arxiv.org/abs/q-bio/0309020},
  pdf =		 {bioinformatics/eisenberg-levanon-03.pdf},
  entered_on =	 {04/13/04},
  comments =	 {The discussion regarding how transcription is costly
                  and, therefore, commonly expressed genes (like
                  housekeeping genes) should have short sequences is
                  interesting, but I find this principle hard to
                  believe: first, you need to make the aminoacid
                  sequence that will be able to do the job, and only
                  then may you care about the length of such
                  sequence.},
  howpublished = {arXiv: q-bio/0309020},
}

@misc{apostol-etal-03,
  title =	 {How Predictable Are Biological Sequences?},
  author =	 {Izydor Apostol and Philippe Jacquet and Wojciech
                  Szpankowski},
  year =	 2003,
  postscript =	 {bioinformatics/apostol-etal-03.ps.gz},
  howpublished = {European Conf. Comput. Biol., 2003},
  entered_on =	 {04/13/04},
  comments =	 {Application of the "Sampled Pattern Matching" (SPM)
                  prediction algorithm to biological data. The
                  algorithm is based upon building the pdf of the next
                  symbol conditional on the suffix of the length which
                  is a finite fraction of the longest suffix that has
                  appeared, at least, twice in the preceding
                  sequence. The theorem that is alluded to in the text
                  is based on the Markovian assumption about the
                  source, which is not necessarily a biologically
                  relevant assumption. The paper mentions that the
                  predictability, as measured by the error of the SPM
                  algorithm is different for coding vs. non-coding
                  regions in DNA, which is a nice, but not a novel
                  result.},
}

@inproceedings{lonardi-szpankowski-03,
  title =	 {Joint Source--Channel {LZ'77} Coding},
  author =	 {Stefano Lonardi and Wojciech Szpankowski},
  booktitle =	 {Proc. Data Compr. Conf. 2003},
  year =	 2003,
  publisher =	 {IEEE Computer Society Press},
  pdf =		 {entropy/lonardi-szpankowski-03.pdf},
  entered_on =	 {04/13/04},
  abstract =	 {Limited memory and bounded communication resources
                  require powerful data compression techniques, but at
                  the same time noisy tetherless channels and/or
                  corrupted file systems need error correction
                  capabilities. Joint source-channel coding has
                  emerged as a viable solution to this problem. We
                  present here the first practical joint
                  source-channel coding algorithm capable of
                  correcting errors in the popular Lempel-Ziv'77
                  scheme without practically losing any compression
                  power. This is possible since the LZ'77 encoder does
                  not completely remove all redundancy. The inherent
                  additional redundancy left by the LZ'77 encoder is
                  used succinctly by a channel coder (e.g.,
                  Reed-Solomon coder) to protect against limited
                  number of errors. In addition to this, the scheme
                  proposed here is perfectly backward-compatible, that
                  is, a file compressed with our error resilient LZ-77
                  can be still decompressed by a common LZ'77
                  decoder. In this preliminary report, we present our
                  algorithm, collect some experimental data supporting
                  our claims, and provide some theoretical
                  justifications.},
  comments =	 {The error correction is based upon the fact that
                  the longest prefix in LZ'77 code usually appears
                  more than once in the preceding parsed text. One
                  usually chooses a random (first) of these repeats and
                  points to it. Instead, one may point at the repeat
                  with number coded by the bits of some text sequence,
                  which may be, for example, the parity sequence for
                  the original text, etc. For Markov processes it's
                  proven that there will be enough of such redundancy
                  to be able to do error correction. Some cute
                  numerical experiments.},
}

@techreport{barash-friedman-02,
  institution =	 {Hebrew University, CS, Leibniz Center},
  number =	 {2002-05},
  author =	 {Yoseph Barash and Nir Friedman},
  title =	 {Context-Specific {Bayesian} Clustering for Gene
                  Expression Data},
  year =	 2002,
  url =
                  {http://leibniz.cs.huji.ac.il/tr/acc/2002/HUJI-CSE-LTR-2002-5_BF1Full.pdf},
  pdf =		 {bioinformatics/barash-friedman-02.pdf},
  abstract =	 {The recent growth in genomic data and measurements
                  of genome-wide expression patterns allows us to
                  apply computational tools to examine gene regulation
                  by transcription factors. In this work, we present a
                  class of mathematical models that help in
                  understanding the connections between transcription
                  factors and functional classes of genes based on
                  genetic and genomic data. Such a model represents
                  the joint distribution of transcription factor
                  binding sites and of expression levels of a gene in
                  a unified probabilistic model. Learning a combined
                  probability model of binding sites and expression
                  patterns enables us to improve the clustering of the
                  genes based on the discovery of putative binding
                  sites and to detect which binding sites and
                  experiments best characterize a cluster. To learn
                  such models from data, we introduce a new search
                  method that rapidly learns a model according to a
                  Bayesian score. We evaluate our method on synthetic
                  data as well as on real life data and analyze the
                  biological insights it provides. Finally, we
                  demonstrate the applicability of the method to other
                  data analysis problems in gene expression data. },
  entered_on =	 {02/20/04},
  comments =	 {A good primer on the method, supersedes previous
                  work by the authors on the subject ("default tables"
                  for description of contexts, structural "EM"
                  algorithm). I cannot agree with their claim that:
                  The main biological hypothesis underlying most of
                  these analyses is "Genes with a common functional
                  role have similar expression patterns across
                  different experiments." Good reference
                  section. Here's the summary of what they do: "Our
                  method clusters genes with similar expression
                  patterns and promoter regions. In addition, the
                  learned model provides insight on the regulation of
                  genes within each cluster. The key features of our
                  approach are: (1) automatic detection of the number
                  of clusters; (2) automatic detection of random
                  variables that are irrelevant to the clusters; (3)
                  robust clustering in the presence of many such
                  random variables, (4) context-dependent
                  representation that describes which clusters each
                  attribute depends on."},
}

% NOTE(review): the citation key below is misspelled ("murhy" for "murphy").
% It is kept unchanged because existing citations may depend on it.
@techreport{murhy-mian-99,
  author       = {Murphy, Kevin and Mian, Saira},
  title        = {Modelling Gene Expression Data using Dynamic
                  {Bayesian} Networks},
  institution  = {Berkeley, CS Dept.},
  year         = {1999},
  entered_on   = {11/04/03},
  pdf          = {bioinformatics/murphy-mian-99.pdf},
  abstract     = {Recently, there has been much interest in reverse
                  engineering genetic networks from time series
                  data. In this paper, we show that most of the
                  proposed discrete time models -- including the
                  boolean network model [Kau93, SS96], the linear
                  model of D'haeseleer et al. [DWFS99], and the
                  nonlinear model of Weaver et al. [WWS99] -- are all
                  special cases of a general class of models called
                  Dynamic Bayesian Networks (DBNs). The advantages of
                  DBNs include the ability to model stochasticity, to
                  incorporate prior knowledge, and to handle hidden
                  variables and missing data in a principled way. This
                  paper provides a review of techniques for learning
                  DBNs. Keywords: Genetic networks, boolean networks,
                  Bayesian networks, neural networks, reverse
                  engineering, machine learning.},
  comments     = {Simple non-technical description of BN's and
                  DBN's. Nothing in terms of new results.},
}

@misc{nimwegen-03,
  author       = {van Nimwegen, Erik},
  title        = {Scaling laws in the functional content of genomes},
  howpublished = {E-print},
  year         = {2003},
  eprint       = {physics/0307001},
  archiveprefix = {arXiv},
  url          = {http://arxiv.org/abs/physics/0307001},
  pdf          = {bioinformatics/nimwegen-03.pdf},
  entered_on   = {09/18/03},
  abstract     = {With the number of sequenced genomes now over one
                  hundred, and the availability of rough functional
                  annotations for a substantial proportion of their
                  genes, it has become possible to study the
                  statistics of gene content across genomes. Here I
                  show that, for many high-level functional
                  categories, the number of genes in the category
                  scales as a power-law in the total number of genes
                  in the genome. The occurrence of such scaling laws
                  can be explained with a simple theoretical model,
                  and this model suggests that the exponents of the
                  observed scaling laws correspond to universal
                  constants of the evolutionary process. I discuss
                  some consequences of these scaling laws for our
                  understanding of organism design.},
  comments     = {The paper reviews scalings of the number of genes in
                  a given functional unit versus the total number of
                  genes in the organism, as given by some (standard)
                  databases. The dependence is, apparently, a power
                  law for many different classes of genes and
                  different types of animals. There are, obviously,
                  many things that can go wrong with such analysis (is
                  it really a power law? what's the dependence on the
                  database? on assignment to functional units? etc.),
                  but Erik seems to address most of such questions
                  very reasonably. The simple model proposed to
                  explain the results is probably too simplistic.},
}

@article{ebeling-jimenezmontano-80,
  author       = {Ebeling, W. and Jim{\'e}nez-Monta{\~n}o, M. A.},
  title        = {On grammars, complexity and information measures of
                  biological macromolecules},
  journal      = {Math. Biosci.},
  volume       = {52},
  year         = {1980},
  pages        = {53--71},
  keywords     = {TO_GET},
  comments     = {Should contain some references to the
                  Thiele-Schreidereiter grammar complexity
                  compression. Also see
                  \cite{jimenezmontano-etal-02,grassberger-02} for
                  subsequent developments.},
}

@article{grosse-etal-00,
  author       = {Grosse, Ivo and Herzel, Hanspeter and Buldyrev,
                  Sergey V. and Stanley, H. Eugene},
  title        = {Species Independence of Mutual Information in Coding
                  and Noncoding {DNA}},
  journal      = {Physical Review E},
  volume       = {61},
  number       = {5},
  pages        = {5624--5629},
  month        = may,
  year         = {2000},
  abstract     = {We explore if there exist universal statistical
                  patterns that are different in coding and noncoding
                  DNA and can be found in all living organisms,
                  regardless of their phylogenetic origin. We find
                  that (i) the mutual information function has a
                  significantly different functional form in coding
                  and noncoding DNA. We further find that (ii) the
                  probability distributions of the average mutual
                  information I are significantly different in coding
                  and noncoding DNA, while (iii) they are almost the
                  same for organisms of all taxonomic
                  classes. Surprisingly, we find that I is capable of
                  predicting coding regions as accurately as
                  organism-specific coding measures.},
  keywords     = {TO_GET},
  entered_on   = {08/21/2001},
}

@article{herzel-etal-98,
  title        = {Interpreting Correlations in Biosequences},
  author       = {Herzel, H. and Trifonov, E. N. and Weiss, O. and
                  Grosse, I.},
  journal      = {Physica A},
  year         = {1998},
  volume       = {249},
  pages        = {449--459},
  keywords     = {TO_GET},
  entered_on   = {08/21/2001},
}

@article{herzel-grosse-95,
  author       = {Herzel, H. and Grosse, I.},
  title        = {Measuring correlations in symbol sequences},
  journal      = {Physica A},
  volume       = {216},
  number       = {4},
  pages        = {518--542},
  month        = jul,
  year         = {1995},
  abstract     = {The paper is devoted to relations between
                  correlation functions and mutual information. It is
                  shown that, in the sequences over an alphabet of
                  $\lambda$ symbols, statistical dependences are
                  measured by $(\lambda-1)^2$ independent
                  parameters. However, not all of them can be
                  determined by autocorrelation functions. Appropriate
                  sets of correlation functions (including
                  crosscorrelations) are introduced, which allow the
                  detection of all dependences. The results are
                  exemplified for binary, ternary, and quaternary
                  symbol sequences. As an application, it is discussed
                  that a nonuniform codon usage in protein-coding DNA
                  sequences introduces periodic correlations even at
                  distances in the order of 1000 base pairs.},
  keywords     = {TO_GET, should have a xerox},
}

@article{herzel-grosse-97,
  author       = {Herzel, H. and Grosse, I.},
  title        = {Correlations in {DNA} Sequences - the Role of Protein
                  Coding Segments},
  journal      = {Phys. Rev. E},
  volume       = {55},
  pages        = {800--810},
  year         = {1997},
  entered_on   = {08/21/2001},
  keywords     = {TO_READ},
  pdf          = {bioinformatics/herzel_grosse_97.pdf},
}

@article{li-kaneko-92,
  author       = {Li, W. and Kaneko, K.},
  title        = {Long-range correlations and partial {$1/f$} spectrum
                  in noncoding {DNA} sequence},
  journal      = {Europhys. Lett.},
  volume       = {17},
  year         = {1992},
  pages        = {655--661},
  keywords     = {TO_GET},
}

@article{schmitt-herzel-97-jtb,
  author       = {Schmitt, A. O. and Herzel, H.},
  title        = {Estimating the Entropy of {DNA} Sequences},
  journal      = {J. theor. Biol.},
  volume       = {188},
  pages        = {369--377},
  year         = {1997},
  entered_on   = {08/21/2001},
  keywords     = {TO_GET},
}

@article{stanley-etal-94,
  author       = {Stanley, H. E. and others},
  title        = {Statistical mechanics in biology: how ubiquitous are
                  long-range correlations?},
  journal      = {Physica A},
  volume       = {205},
  year         = {1994},
  pages        = {214--241},
  keywords     = {TO_GET},
}

@article{weiss-herzel-98,
  author       = {Weiss, O. and Herzel, H.},
  title        = {Measuring Correlations in Protein Sequences},
  journal      = {Z. Phys. Chem.},
  volume       = {204},
  pages        = {183--197},
  year         = {1998},
  entered_on   = {08/21/2001},
  keywords     = {TO_GET},
}

@article{weiss-herzel-98-jtb,
  title        = {Correlations in Protein Sequences and Property
                  Codes},
  author       = {Weiss, O. and Herzel, H.},
  journal      = {J. theor. Biol.},
  year         = {1998},
  volume       = {190},
  pages        = {341--353},
  keywords     = {TO_GET},
  entered_on   = {08/21/2001},
}

@article{weiss-jimenezmontano-herzel-00,
  author       = {Weiss, O. and Jim{\'e}nez-Monta{\~n}o, M. and
                  Herzel, H.},
  title        = {Information content of protein sequences},
  journal      = {J. theor. Biol.},
  volume       = {206},
  pages        = {379--386},
  year         = {2000},
  entered_on   = {08/21/2001},
  keywords     = {TO_GET},
}

@article{drew-etal-05,
  author       = {Drew, M. and Zupan, B. and Cooke, A. and Couvillon,
                  P. and Balsam, P.},
  title        = {Temporal Control of Conditioned Responding in Goldfish},
  journal      = {J. Exp. Psychol. Anim. Behav. Process.},
  year         = {2005},
  volume       = {31},
  number       = {1},
  pages        = {31--39},
  pdf          = {bio-learning/drew-etal-05.pdf},
  abstract     = {The peak procedure was used to characterize response
                  timing during acquisition and maintenance of
                  conditioned responding in goldfish. Subjects
                  received light-shock pairings with a 5- or 15-s
                  interstimulus interval. On interspersed peak trials,
                  the conditioned stimulus light was presented for 45
                  s and no shock was delivered. Peaks in the
                  conditioned response, general activity, occurred at
                  about the time of the expected unconditioned
                  stimulus, and variability in the activity
                  distribution was scalar. Modeling of the changes in
                  the activity distributions over sessions revealed
                  that the temporal features of the conditioned
                  response changed very little during acquisition. The
                  data suggest that times are learned early in
                  training, and, contrary to I. P. Pavlov's
                  (1927/1960) concept of ``inhibition of delay,'' that
                  timing is learning when to respond rather than
                  learning when not to respond.},
  comments     = {Nice set of references to timing of the CR after the
                  onset of the CS in vertebrates; this specific paper
                  examines the goldfish.},
}

@article{balsam-gibbon-88,
  author       = {Balsam, P. and Gibbon, J.},
  year         = {1988},
  title        = {Formation of tone--{US} associations does not
                  interfere with the formation of context--{US}
                  associations in pigeons},
  journal      = {J. Exp. Psychol. Anim. Behav. Process.},
  volume       = {14},
  pages        = {401--412},
  pdf          = {bio-learning/balsam-gibbon-88.pdf},
  abstract     = {In four experiments we investigated whether signaled
                  and unsignaled US presentations resulted in
                  differential context conditioning. Experiments 1 and
                  2 showed that the presence of a tone during grain
                  presentation facilitated the formation of tone-food
                  associations in pigeons. Experiment 2 also showed
                  that the acquisition of associative value by the
                  tone did not diminish associations between context
                  and the unconditioned stimulus (US). Experiment 3
                  showed that signaled USs did not interfere with the
                  acquisition of context-US associations, and
                  Experiment 4 showed that even when the signal was
                  extensively pretrained, context-US associations
                  could not be blocked. The results of these
                  experiments are inconsistent with conditioning
                  models that require competition between cues and
                  contexts for associative value.},
  comments     = {Comparison of association to background and to
                  CS. Are they independent? Can background be
                  overshadowed? Is background just another CS?
                  Comparison to Scalar Expectancy theory and to
                  Rescorla-Wagner. Acquisition is dramatically faster
                  if the context is extinguished prior to
                  autoshaping. Need to read more thoroughly.},
}


@article{grossberg-82,
  author       = {Grossberg, S.},
  year         = {1982},
  title        = {Processing of expected and unexpected events during
                  conditioning and attention: A psychophysiological
                  theory},
  journal      = {Psychological Rev.},
  volume       = {89},
  pages        = {529--572},
  comments     = {Studies competition between different CS's to
                  predict a US.},
}

@incollection{jenkins-etal-81,
  author       = {Jenkins, H. and Barnes, R. and Barrera, F.},
  year         = {1981},
  title        = {Why autoshaping depends on trial spacing},
  editor       = {Locurto, C. and Terrace, H. and Gibbon, J.},
  booktitle    = {Autoshaping and conditioning theory},
  pages        = {255--284},
  address      = {New York},
  publisher    = {Academic Press},
  comments     = {Fast (about 30 rewards) acquisition time for the
                  CR.},
}


@article{gallistel-gibbon-00,
  author       = {Gallistel, C. R. and Gibbon, J.},
  year         = {2000},
  title        = {Time, rate and conditioning},
  journal      = {Psychological Rev.},
  volume       = {107},
  pages        = {289--344},
  entered_on   = {01/03/06},
  pdf          = {bio-learning/gallistel-gibbon-00.pdf},
  abstract     = {We draw together and develop previous timing models
                  for a broad range of conditioning phenomena to
                  reveal their common conceptual foundations: First,
                  conditioning depends on the learning of the temporal
                  intervals between events and the reciprocals of
                  these intervals, the rates of event
                  occurrences. Second, remembered intervals and rates
                  translate into observed behavior through decision
                  processes whose structure is adapted to noise in the
                  decision variables. The noise and the uncertainties
                  consequent upon it have both subjective and
                  objective origins. A third feature of these models
                  is their time-scale invariance, which we argue is a
                  deeply important property evident in the available
                  experimental data. This conceptual framework is
                  similar to the psychophysical conceptual framework
                  in which contemporary models of sensory processing
                  are rooted. We contrast it with the associative
                  conceptual framework.},
  comments     = {Review of the scalar expectancy theory and the rate
                  estimation theory. Some interesting points (see also
                  \cite{kakade-dayan-02} for summary of some): time to
                  acquisition of CR depends on the number of
                  reinforcers and does not depend on the fraction of
                  reinforced CS. Note that RET for acquisition of
                  conditioned response would suggest excessively large
                  thresholds $\beta$, as argued in
                  \cite{kakade-dayan-02}. Review of all major
                  conditioning paradigms.},
}


@incollection{dayan-01,
  author       = {Dayan, P.},
  year         = {2001},
  title        = {Reinforcement learning},
  editor       = {Gallistel, C. R.},
  booktitle    = {Stevens' Handbook of Experimental Psychology},
  address      = {New York, NY},
  publisher    = {Wiley},
  pdf          = {bio-learning/dayan-01.pdf},
}



@inproceedings{dayan-long-98,
  author       = {Dayan, P. and Long, T.},
  title        = {Statistical models of conditioning},
  booktitle    = {Adv. Neural Inf. Proc. Syst. 10},
  publisher    = {MIT Press},
  year         = {1998},
  pdf          = {bio-learning/dayan-long-98.pdf},
  abstract     = {Conditioning experiments probe the ways that animals
                  make predictions about rewards and punishments and
                  use those predictions to control their behavior. One
                  standard model of conditioning paradigms which
                  involve many conditioned stimuli suggests that
                  individual predictions should be added
                  together. Various key results show that this model
                  fails in some circumstances, and motivate an
                  alternative model, in which there is attentional
                  selection between different available stimuli. The
                  new model is a form of mixture of experts, has a
                  close relationship with some other existing
                  psychological suggestions, and is statistically
                  well-founded.},
  comments     = {The paper has all the usual problems of the paper
                  where the experimentalist does not agree with the
                  animal on the priors. They review different
                  conditioning experiments, in particular focusing
                  on the downwards unblocking, which seems to be
                  unexplainable by the standard
                  \cite{rescorla-wagner-72} US-processing theory, that is,
                  prediction-discrepancy-reinforcement type of
                  learning modeling. They present a different model
                  where each of the CSs acts as an independent expert
                  in predicting a US, and the cooperative mixture of
                  experts is used by the animals to make its
                  predictions. References I need to read are Konorski,
                  1967, Grossberg, 1982, Solomon and Corbit 1974
                  ("opponency").},
}

@article{kakade-dayan-02,
  author       = {Kakade, Sham and Dayan, Peter},
  title        = {Acquisition and Extinction in Autoshaping},
  journal      = {Psych. Rev.},
  year         = {2002},
  volume       = {109},
  number       = {3},
  pages        = {533--544},
  pdf          = {bio-learning/kakade-dayan-02.pdf},
  abstract     = {C. R. Gallistel and J. Gibbon (2000) presented
                  quantitative data on the speed with which animals
                  acquire behavioral responses during autoshaping,
                  together with a statistical model of learning
                  intended to account for them. Although this model
                  captures the form of the dependencies among critical
                  variables, its detailed predictions are
                  substantially at variance with the data. In the
                  present article, further key data on the speed of
                  acquisition are used to motivate an alternative
                  model of learning, in which animals can be
                  interpreted as paying different amounts of attention
                  to stimuli according to estimates of their
                  differential reliabilities as predictors.},
  comments     = {Starts with references to earlier papers by Gibbons,
                  Gallistel, Balsam, etc. Reviews data that time to CR
                  acquisition is proportional to CS duration (that is,
                  inversely proportional to US rate during CS
                  presence), and inversely proportional to the
                  intertrial distance (that is, proportional to the
                  conservative, Laplace, estimate of the background US
                  rate). Further, the number of US presentations (not
                  of trials in partial reinforcement scenarios)
                  determines time to acquisition. They show, however,
                  that simple rate estimation produces inconsistencies
                  with data, since the learning seems to be much
                  slower in the zero background rate case when the
                  background US rate is present (fig 2). Further (fig
                  3 and related discussion) there is a problem of
                  attributing the USs to the background or to the
                  CS's, especially if some prior exposure to US has
                  been arranged (without a CS). In this case, the
                  background rate should depend on the delivery rate
                  of US in prior non-CS regions, and so should the time
                  to acquisition. They do not; and the dependence is
                  only on the number of prior non-CS related USs. The
                  authors introduce the ``windowing'' model for
                  estimation of the background US rate and the US rate
                  during the CS. Of course, then one must agree with
                  the animal how the window length should be set; they
                  do not really address this issue (a more general
                  Kalman-filter-like model introduced in the Appendix
                  suffers from the same problem). Then each expert
                  (CS, background, etc.) has its own estimate of the
                  US rate, and the overall animal's expectation is a
                  weighted sum of these predictions, with the weights
                  proportional to the reliability of each individual
                  expert; the experts may thus, for example, block
                  each other. No law for the evolution of
                  reliabilities is given. With some empirical laws, it
                  might be possible to resolve the problem of varying
                  acquisition rates (see above). But is this really an
                  improvement -- we have produced a better fit by
                  merely introducing yet another free, undetermined,
                  function -- the reliability. In general, the article
                  focuses on testing certain laws, algorithms for
                  modeling animal behavior instead of studying the
                  problem that is being solved by the animal in some
                  general, method-independent, way. It's difficult to
                  model learning when you don't know what the animal's
                  priors are.},
}

@inproceedings{kakade-dayan-00,
  author       = {Kakade, Sham and Dayan, Peter},
  title        = {Acquisition in autoshaping},
  booktitle    = {Adv. Neural Inf. Proc. Syst. (NIPS) 12},
  year         = {2000},
  pdf          = {bio-learning/kakade-dayan-00.pdf},
  abstract     = {Quantitative data on the speed with which animals
                  acquire behavioral responses during classical
                  conditioning experiments should provide strong
                  constraints on models of learning. However, most
                  models have simply ignored these data; the few that
                  have attempted to address them have failed by at
                  least an order of magnitude. We discuss key data on
                  the speed of acquisition, and show how to account for
                  them using a statistically sound model of learning,
                  in which differential reliabilities of stimuli play
                  a crucial role.},
  comments     = {Preliminary version of \cite{kakade-dayan-02}.},
}
@incollection{bialek-02,
  author       = {Bialek, W.},
  title        = {Thinking about the brain},
  booktitle    = {Physics of bio-molecules and cells: Les Houches,
                  Session LXXV, 2--27 July 2001},
  editor       = {Flyvbjerg, H. and J{\"u}licher, F. and Ormos, P. and
                  David, F.},
  publisher    = {EDP Sciences, Springer},
  address      = {Les Ulis, Berlin},
  year         = {2002},
  pages        = {486--577},
  url          = {http://arxiv.org/abs/physics/0205030},
  pdf          = {bio-learning/bialek-02.pdf},
  entered_on   = {07/07/05},
  abstract     = {We all are fascinated by the phenomena of
                  intelligent behavior, as generated both by our own
                  brains and by the brains of other animals. As
                  physicists we would like to understand if there are
                  some general principles that govern the structure
                  and dynamics of the neural circuits that underlie
                  these phenomena. At the molecular level there is an
                  extraordinary universality, but these mechanisms are
                  surprisingly complex. This raises the question of
                  how the brain selects from these diverse mechanisms
                  and adapts to compute "the right thing" in each
                  context. One approach is to ask what problems the
                  brain really solves. There are several examples -
                  from the ability of the visual system to count
                  photons on a dark night to our gestalt recognition
                  of statistical tendencies toward symmetry in random
                  patterns - where the performance of the system in
                  fact approaches some fundamental physical or
                  statistical limits. This suggests that some sort of
                  optimization principles may be at work, and there
                  are examples where these principles have been
                  formulated clearly and generated predictions which
                  are confirmed in new experiments; a central theme in
                  this work is the matching of the coding and
                  computational strategies of the brain to the
                  statistical structure of the world around
                  us. Extension of these principles to the problem of
                  learning leads us into interesting theoretical
                  questions about how to measure the complexity of the
                  data from which we learn and the complexity of the
                  models that we use in learning, as well as opening
                  some new opportunities for experiment. This
                  combination of theoretical and experimental work
                  gives us some new (if still speculative)
                  perspectives on classical problems and controversies
                  in cognition.},
}


@article{gallistel-etal-04,
  author       = {Gallistel, C. R. and Fairhurst, S. and Balsam, P.},
  title        = {The learning curve: Implications of a quantitative
                  analysis},
  journal      = {Proc. Natl. Acad. Sci. (USA)},
  year         = {2004},
  volume       = {101},
  number       = {36},
  pages        = {13124--13131},
  pdf          = {bio-learning/gallistel-etal-04.pdf},
  entered_on   = {07/07/05},
  abstract     = {The negatively accelerated, gradually increasing
                  learning curve is an artifact of group averaging in
                  several commonly used basic learning paradigms
                  (pigeon autoshaping, delay- and trace-eyeblink
                  conditioning in the rabbit and rat, autoshaped
                  hopper entry in the rat, plus maze performance in
                  the rat, and water maze performance in the
                  mouse). The learning curves for individual subjects
                  show an abrupt, often step-like increase from the
                  untrained level of responding to the level seen in
                  the well trained subject. The rise is at least as
                  abrupt as that commonly seen in psychometric
                  functions in stimulus detection experiments. It may
                  indicate that the appearance of conditioned behavior
                  is mediated by an evidence-based decision process,
                  as in stimulus detection experiments. If the
                  appearance of conditioned behavior is taken instead
                  to reflect the increase in an underlying associative
                  strength, then a negligible portion of the function
                  relating associative strength to amount of
                  experience is behaviorally visible. Consequently,
                  rate of learning cannot be estimated from the
                  group-average curve; the best measure is latency to
                  the onset of responding, determined for each subject
                  individually.},
  comments     = {The paper reviews a series of various conditioning
                  experiments and shows that in all cases learning is
                  almost instantaneous on a single individual level
                  (while smooth, if averaged over populations). This
                  resembles many effects in other fields of
                  literature, such as \cite{cluzel-etal-00}. Of
                  course, in this work the authors talk about
                  acquisition of conditioning behavior (that is,
                  making the animals understand that there is a very
                  strong CS-US association), rather than learning a
                  value of the association from some set. It's unclear
                  what the priors are in this case, thus it is unclear
                  how should learning proceed. Further, behavior is
                  measured by some summary statistics. While these
                  statistics change abruptly, other aspects may be
                  evolving slower. Thirdly, in psychophysics signal
                  detection experiments, one notices that decisions
                  come abruptly if you force them to. Allowing for
                  probabilistic choice even in detection tasks makes
                  learning smooth (no "rounding" effect).},
}

@article{knudsen-02,
  author       = {Knudsen, E.},
  title        = {Instructed learning in the auditory localization
                  pathway of the barn owl},
  journal      = {Nature},
  year         = {2002},
  volume       = {417},
  pages        = {322--328},
}


@article{miller-56,
  author       = {Miller, George A.},
  title        = {The Magical Number Seven, Plus or Minus Two: Some
                  Limits on Our Capacity for Processing Information},
  journal      = {Psych. Rev.},
  volume       = {63},
  pages        = {81--97},
  year         = {1956},
  entered_on   = {06/29/05},
  pdf          = {bio-learning/miller-56.pdf},
  comments     = {Classical paper suggesting that in many conditions
                  people can discriminate about 7 possibilities only.},
}

@article{tversky-kahneman-81,
  author       = {Tversky, A. and Kahneman, D.},
  title        = {The framing of decisions and the psychology of
                  choice},
  journal      = {Science},
  year         = {1981},
  volume       = {211},
  pages        = {453--458},
  entered_on   = {06/30/05},
  pdf          = {bio-learning/tversky-kahneman-81.pdf},
  abstract     = {The psychological principles that govern the
                  perception of decision problems and the evaluation
                  of probabilities and outcomes produce predictable
                  shifts of preference when the same problem is framed
                  in different ways. Reversals of preference are
                  demonstrated in choices regarding monetary outcomes,
                  both hypothetical and real, and in questions
                  pertaining to the loss of human lives. The effects
                  of frames on preferences are compared to the effects
                  of perspectives on perceptual appearance. The
                  dependence of preferences on the formulation of
                  decision problems is a significant concern for the
                  theory of rational choice.},
  comments     = {A classic paper about "irrational" decision making
                  by humans. They propose the "prospects" theory
                  instead of the "utility" theory. The hallmarks of
                  the theory are: larger (negative) utility for loss
                  than the (positive) utility for similar
                  gain. Convexity of the utility (the value of epsilon
                  on top of a large number is less than the value of
                  epsilon alone). Additionally, when averaging
                  potential income, people weigh small probabilities
                  disproportionally highly (could it be because small
                  probabilities come with respectively larger error
                  bars?). With such nonlinear cost and weighting, it
                  becomes important, which level is chosen as the
                  status-quo (zero loss/gain), and whether multiple
                  choices are considered independent, or as a part of
                  one big decision.},
}


@article{rescorla-00,
  title =	 {Associative changes with a random {CS-US}
                  relationship},
  author =	 {R Rescorla},
  abstract =	 {Three experiments examined conditioned magazine
                  approach in rats when a positive unconditioned
                  stimulus (US) bore a random relation to a
                  conditioned stimulus (CS). Experiment 1 found that
                  over the course of conditioning the CS initially
                  elevated responding relative to the baseline but
                  then lost the power to do so. Transfer tests
                  revealed that a CS-US association developed early
                  and persisted despite the decline in magazine
                  responding. Experiment 2 confirmed the persistence
                  of CS-US associations and found them to be more
                  substantial when a different US occurred during the
                  CS than in its absence. In Experiment 3, when the
                  situation was exposed to US alone presentations
                  prior to introducing the CS, there was little
                  evidence that a subsequent random relation between
                  the CS and US produced an association between
                  them. These results agree with those of blocking and
                  overshadowing experiments using discrete CSs and
                  support an interpretation of the random procedure in
                  terms of competition between the background and CS
                  for conditioning.},
  journal =	 {The Quarterly J. Exp. Psych.},
  year =	 2000,
  volume =	 {53B},
  number =	 4,
  pages =	 {325--340},
  pdf =		 {bio-learning/rescorla-2000.pdf},
  comments =	 {The experiments have quite complicated
                  contingencies. I would not be able to follow them;
                  it's surprising that the rats do what they do. Notice
                  that the rats at the start of the experiment are
                  quite naive. Thus increases of responses to CS which
                  is only randomly related to US might be due to the
                  rat just figuring out the relation, not knowing
                  precisely what to expect, and clearly seeing that
                  the food is more plentiful when the CS is around
                  compared to all the time during the day when it is
                  not in the experiment. I think a trained rat at the
                  beginning of the experiment might be better as we
                  know that it is not learning about the environment,
                  about statistics of the food delivery, and all that
                  kind of crap, but only about CS-US
                  association. Further, the paper makes a clear
                  distinction between the acquisition of the behavior
                  and extinction of the behavior. The transfer method
                  for measuring associations is cute. However, it
                  measures something about the animal's behavior
                  (which involves animal's desires, moods, etc.) which
                  is not the same as measuring that the animal has
                  learned the association. We need to look for changes
                  in the behavior (any changes!) as a function of time
                  to analyze the latter. There are also some possible
                  problems with the experiment not being truly random
                  (top of page 329).},
}

@article{fusi-etal-05,
  author =	 {S Fusi and P Drew and L Abbott},
  journal =	 {Neuron},
  volume =	 45,
  pages =	 {599--611},
  year =	 2005,
  title =	 {Cascade Models of Synaptically Stored Memories},
  abstract =	 {Storing memories of ongoing, everyday experiences
                  requires a high degree of plasticity, but retaining
                  these memories demands protection against changes
                  induced by further activity and experience. Models
                  in which memories are stored through switch-like
                  transitions in synaptic efficacy are good at storing
                  but are bad at retaining memories if these
                  transitions are likely, and they are poor at storage
                  but good at retention if they are unlikely. We
                  construct and study a model in which each synapse
                  has a cascade of states with different levels of
                  plasticity, connected by metaplastic
                  transitions. This cascade model combines high levels
                  of memory storage with long retention times and
                  significantly outperforms alternative models. As a
                  result, we suggest that memory storage requires
                  synapses with multiple states exhibiting dynamics
                  over a wide range of timescales, and we suggest
                  experimental tests of this hypothesis.},
  pdf =		 {bio-learning/fusi-etal-05.pdf},
  comments =	 {An explicit model that generates power law
                  forgetting from exponential transitions with rates
                  spread over many orders of magnitude. Interesting,
                  but to some extent pretty obvious a posteriori.},
}

@article{earn-johnstone-97,
  author =	 {D Earn and R Johnstone},
  title =	 {A systematic error in tests of ideal free theory},
  journal =	 {Proc. Roy. Soc. Lond. B},
  volume =	 {264},
  pages =	 {1671--1675},
  year =	 {1997},
  pdf =		 {bio-learning/earn_johnstone_97.pdf},
  entered_on =	 {01/19/05},
  abstract =	 {Classical ideal free theory predicts that the distribution
                  of consumers within a patchy environment should
                  correspond to the distribution of resources. Tests of
                  these predictions have inappropriately compared ratios
                  of mean resource levels and mean consumer densities,
                  rather than means of ratios. We show that this error,
                  which has propagated through hundreds of studies, leads
                  to a systematic bias: the theory will appear to
                  underestimate the number of consumers occupying poor
                  patches. We explain the correct way to test the ideal
                  free theory and apply it to published data; the
                  classical model is then seen to yield far more accurate
                  predictions than previously thought.},
  comments =	 {I think that Appendix 1 (derivation of crucial results)
                  is simply wrong. The results may still hold though.},
}

@article{balkovsky-shraiman-02,
  journal =	 {Proc. Natl. Acad. Sci},
  year =	 2002,
  volume =	 99,
  number =	 20,
  pages =	 {12589--12593},
  title =	 {Olfactory search at high {Reynolds} number},
  author =	 {E Balkovsky and B Shraiman},
  abstract =	 {Locating the source of odor in a turbulent
                  environment---a common behavior for living
                  organisms---is nontrivial because of the random
                  nature of mixing. Here we analyze the statistical
                  physics aspects of the problem and propose an efficient
                  strategy for olfactory search that can work in
                  turbulent plumes. The algorithm combines the maximum
                  likelihood inference of the source position with an
                  active search. Our approach provides the theoretical
                  basis for the design of olfactory robots and the
                  quantitative tools for the analysis of the observed
                  olfactory search behavior of living creatures (e.g.,
                  odor-modulated optomotor anemotaxis of moths).},
  entered_on =	 {07/21/04},
  pdf =		 {bio-learning/balkovsky-shraiman-02.pdf},
  comments =	 {Smell propagates in the wind in terms of long lived
                  localized smell patches. While the algorithm they
                  produce is most probably OK, the derivation is
                  lacking. They claim that each arriving patch reduces
                  the uncertainty about the source, and the best
                  strategy is to move in the direction of the
                  patch. Why? A better way to formulate the problem
                  would be as follows. Write down the entropy of the
                  source location, and choose the trajectories that
                  minimize such entropy (maximum MI between source and
                  next step). This will have to be averaged over random
                  plume positions. Notice that this will take into
                  account the fact that maximum-likelihood like
                  methods won't work -- if by mistake you exit the
                  plume cone, you will never find the source.},
}

@article{brenner-00,
  journal =	 {Neuron},
  volume =	 26,
  pages =	 {695--702},
  year =	 2000,
  title =	 {Adaptive Rescaling Maximizes Information
                  Transmission},
  author =	 {Naama Brenner and William Bialek and Rob de Ruyter
                  van Steveninck},
  pdf =		 {bio-learning/brenner_etal_00.pdf},
  abstract =	 {Adaptation is a widespread phenomenon in nervous
                  systems, providing flexibility to function under
                  varying external conditions. Here, we relate an
                  adaptive property of a sensory system directly to
                  its function as a carrier of information about input
                  signals. We show that the input/output relation of a
                  sensory system in a dynamic environment changes with
                  the statistical properties of the
                  environment. Specifically, when the dynamic range of
                  inputs changes, the input/output relation rescales
                  so as to match the dynamic range of responses to
                  that of the inputs. We give direct evidence that the
                  scaling of the input/output relation is set to
                  maximize information transmission for each
                  distribution of signals. This adaptive behavior
                  should be particularly useful in dealing with the
                  intermittent statistics of natural signals.},
  entered_on =	 {06/02/03},
  comments =	 {As the abstract says, the paper analyzes response of
                  the H1 neuron to slow and fast (compared to the fly
                  behavioral response time of 30ms) changing velocity
                  stimulus. In both cases it is evident that the fly
                  adapts its coding strategy to the stimulus, and it
                  looks that adaptation is such as to maximize the
                  information transfer.},
}

@article{deweese-zador-98,
  title =	 {Asymmetric Dynamics in Optimal Variance Adaptation},
  author =	 {Michael DeWeese and Anthony Zador},
  abstract =	 {It has long been recognized that sensory systems
                  adapt to their inputs. Here we formulate the problem
                  of optimal variance estimation for a broad class of
                  nonstationary signals. We show that under weak
                  assumptions, the Bayesian optimal causal variance
                  estimate shows asymmetric dynamics: an abrupt
                  increase in variance is more readily detectable than
                  an abrupt decrease. By contrast, optimal adaptation
                  to the mean displays symmetric dynamics when the
                  variance is held fixed. After providing several
                  empirical examples and a simple intuitive argument
                  for our main result, we prove that optimal
                  adaptation is asymmetrical in a broad class of model
                  environments. This observation makes specific and
                  falsifiable predictions about the time course of
                  adaptation in neurons probed with certain stimulus
                  ensembles.},
  journal =	 {Neural Comput.},
  volume =	 10,
  pages =	 {1179--1202},
  year =	 1998,
  pdf =		 {bio-learning/deweese-zador-98.pdf},
  entered_on =	 {05/30/03},
  comments =	 {The paper shows that it's more difficult to see a
                  variance decrease than the increase (provided, tails
                  of the underlying distributions are not too
                  heavy). They use causal estimation, where the value
                  of the variance at the next point depends on only
                  the previous ones, and only through the last
                  point. It is not obvious that, once a new datum
                  comes in, one should not revisit one's earlier
                  estimate, and do a re-estimation (batch
                  vs. online). Then the procedure will become similar
                  to Kalman filtering -- prediction/correction
                  scheme. This deserves further attention.},
}

@article{fairhall-02,
  title =	 {Efficiency and ambiguity in an adaptive neural code},
  author =	 {Adrienne L.~Fairhall and Geoffrey D.~Lewen and
                  William Bialek and Robert R.~de Ruyter van
                  Steveninck},
  abstract =	 {We examine the dynamics of a neural code in the
                  context of stimuli whose statistical properties are
                  themselves evolving dynamically. Adaptation to these
                  statistics occurs over a wide range of
                  timescales -- from tens of milliseconds to
                  minutes. Rapid components of adaptation serve to
                  optimize the information that action potentials
                  carry about rapid stimulus variations within the
                  local statistical ensemble, while changes in the
                  rate and statistics of action-potential firing encode
                  information about the ensemble itself, thus
                  resolving potential ambiguities. The speed with
                  which information is optimized and ambiguities are
                  resolved approaches the physical limit imposed by
                  statistical sampling and noise.},
  entered_on =	 {05/30/03},
  pdf =		 {bio-learning/fairhall_etal_02.pdf},
  journal =	 {Nature},
  year =	 2001,
  month =	 aug,
  volume =	 {412},
  number =	 6849,
  pages =	 {787--792},
  comments =	 {The paper discusses adaptation of the neural code
                  in the fly to velocity signal with zero mean
                  velocity and different variances. The following
                  effects are noticed. For periodic variance
                  modulation, adaptation seems to have a time scale
                  proportional to the period. Adaptation to higher
                  variance happens faster than to lower
                  variance. Information about the signal per spike is
                  (almost) the same irrespective of the signal
                  variance. It slightly drops when the variance is
                  abruptly shifted downwards, but then
                  recovers. Information per spike about the signal
                  variance seems to be constant. The paper suggests
                  that precise spike placement encodes precise
                  temporal structure of the signal, while the longer
                  (averaged) properties encode large scale attributes
                  of the signal (its variance). If the rate of changes
                  in the variance and local properties is about the
                  same (like in variance switching), then these two
                  are mixed with each other. Paper derives the
                  physical limits on the speed of adaptation, but
                  I don't follow the derivation.},
}


@article{gallistel-etal-01,
  journal =	 {Journal of Experimental Psychology: Animal Behavior
                  Processes},
  year =	 2001,
  volume =	 27,
  pages =	 {354--372},
  title =	 {The Rat Approximates an Ideal Detector of Changes in
                  Rates of Reward: Implications for the Law of Effect},
  author =	 {C.~R.~Gallistel and Terence A.~Mark and Adam King
                  and P.~E.~Latham},
  pdf =		 {bio-learning/gallistel_etal_01.pdf},
  abstract =	 {Rats responded on two levers delivering brain
                  stimulation reward on concurrent variable interval
                  schedules. Following many successive sessions with
                  unchanging relative rates of reward, subjects
                  adjusted to an eventual change slowly and showed
                  spontaneous reversions at the beginning of following
                  sessions. When changes in rates of reward occurred
                  between and within every session, subjects adjusted
                  to them about as rapidly as they could in principle
                  do so, as shown by comparison to a Bayesian model of
                  an ideal detector. This and other features of the
                  adjustments to frequent changes imply that the
                  behavioral effect of reinforcement depends on the
                  subject's perception of incomes and changes in
                  incomes rather than on the strengthening and
                  weakening of behaviors in accord with their past
                  effects or expected results. Models for the process
                  by which perceived incomes determine stay durations
                  and for the process that detects changes in rates
                  are developed.},
  entered_on =	 {04/07/2003, 03/10/2004},
  comments =	 {The paper deals with how rats are capable of feeling
                  changes in the statistics of probability
                  distribution of rewards and adapting to those
                  changes (the rewards are essentially Poisson, and
                  the rates change at some point). It has to be noted
                  that the paper only deals with rats' reaction to
                  changes; however, as Gallistel himself noted to me
                  it is unclear that the rat, even after feeling the
                  change in the rates, will quickly adapt its behavior
                  -- it may have its own reason for doing or not doing
                  so. In this sense the results of the paper are only
                  bounds on the real learning happening in
                  rats. Experiments with more pressing rewards (pain
                  maybe?) should be done to remove the lag (if any)
                  between learning about the change and acting upon
                  it. I find the following points in rats' behavior
                  very interesting. (1) For random rewards, rats
                  apparently adopt a strategy of random visits to
                  different reward sites [IMHO, this is not well
                  proven in the paper: while stay times at a reward
                  site do look random, I don't see an analysis showing
                  that the binary sequence of visits (to one site, or
                  to the other) is also random]. (2) After a very long
                  period of constant rewards rate, it takes very long
                  time for a rat to adjust to rate changes; the
                  adjustments are very slow. If the change is large,
                  adjustment happens slightly faster. This smells like
                  a hysteresis effect (it's always been this way; if
                  it changes, it's only a fluke). (3) If a rate has
                  long been constant and then changed, and the rat has
                  started to adjust, and then the experiment is
                  stopped and restarted after a while, then the rat
                  reverts to the pre-change response (again, treating
                  the changes as a fluke). (4) If changes are many and
                  fast, the behavior adjustments are rapid (with very
                  short delay), almost discontinuous, and almost ideal
                  (though I don't buy the analysis completely -- see
                  below). (5) Transition between long adjustments and
                  step adjustments take only a handful of
                  changes. (6) Some (but not very convincing) analysis
                  shows that the rat does not base its rates estimates
                  on very few events preceding the decision;
                  filtering has long time scale to avoid statistical
                  fluctuations. (7) The rat may make (small)
                  spontaneous changes in its behavior and overadjust
                  for happening changes. (8) The paper argues that
                  expected changes in returns due to rat's adjustment
                  are too small to matter; the rat must be optimizing
                  something else. My biggest problems with the paper
                  are with its appendix -- Bayesian analysis of the
                  events. The model involves two constant rates and a
                  change in between. Similarly, the analysis of the
                  rats' response assumes that the rat changes its
                  behavior at some point rather abruptly. Why should a
                  rat have such a discrete model in its head?
                  Shouldn't it be doing continuous filtering and
                  continuous adaptation of its response? In many
                  respects the experimental findings of the paper
                  resonate well with Knudsen's famous experiments of
                  prismatic glasses and owls. Though there is an
                  important difference: Fig. 4 of the paper says that
                  "the amount of prior experience with changes in
                  rates of reward does not determine how rapidly a
                  subject completes its adjustment to a change in the
                  relative rates of reward. What matters is the
                  frequency with which such changes have been
                  encountered recently. When changes have been
                  infrequent, the subject takes a long time to
                  complete its adjustment". In contrast, an old owl
                  which had experience with glasses when young
                  adapts to the new glasses in Knudsen's
                  experiments. Also, in the spirit of my work
                  \cite{nemenman-04}, maybe it's not only the
                  frequency of changes that matters, but also the
                  amplitude?},
}


@misc{chen-etal-04,
  note =	 {arXiv: q-bio/0402021},
  title =	 {An Exact Model of Fluctuations in Gene Expression},
  author =	 {W Chen and J England and E Shakhnovich},
  abstract =	 {Fluctuations in the measured mRNA levels of unperturbed
                  cells under fixed conditions have often been viewed
                  as an impediment to the extraction of information
                  from expression profiles. Here, we argue that
                  such expression fluctuations should themselves be
                  studied as a source of valuable information about
                  the underlying dynamics of genetic networks. By
                  analyzing microarray data taken from Saccharomyces
                  cerevisiae, we demonstrate that correlations in
                  expression fluctuations have a highly statistically
                  significant dependence on gene function, and
                  furthermore exhibit a remarkable scale-free network
                  structure. We therefore present what we view to be
                  the simplest phenomenological model of a genetic
                  network which can account for the presence of
                  biological information in transcript level
                  fluctuations. We proceed to exactly solve this model
                  using a path integral technique and derive several
                  quantitative predictions. Finally, we propose
                  several experiments by which these predictions might
                  be rigorously tested.},
  year =	 2004,
  pdf =		 {bionets/chen-etal-04.pdf},
  comments =	 {The authors start with a clearly incorrect premise that
                  the noise observed in mRNA expression data is
                  largely due to the intrinsic stochastic
                  fluctuations. This is wrong -- even at the current
                  stage (2-3 years after the paper was written), the
                  largest source of noise is the population
                  variability and the hybridization variability, not
                  the intrinsic stochasticity. Note that in our ARACNE
                  work, the noise was due to the fluctuations of
                  kinetic rates, not due to the intrinsic fluctuations
                  -- thus we modeled cell-to-cell variability this
                  way. The current paper further
                  develops the work of Collins and Gardner on the
                  linear response of genes to small deviations from the
                  steady state by adding some noise to these linear
                  response equations. But they use uniform noise?! And
                  uncorrelated among genes?! Further, eq 16 is clearly
                  true only for constant, uncorrelated noise -- the
                  relationship would be much more complicated
                  otherwise. Once one allows the noise covariance
                  matrix to have >>1 independent elements, the
                  relation of the linear response coefficients to the
                  observed response fluctuations becomes dependent on
                  this entire multi-parameter unknown noise covariance
                  matrix, so that the relation is almost useless for
                  inference tasks. It's also surprising that, given
                  that the authors here do just the FDT analysis for a
                  very specific noise spectrum, they never actually
                  mention FDT explicitly. The discussion after Eq 17
                  is interesting though, suggesting that if the
                  network of interactions is clustered, then the
                  network might be inferrable from the fluctuations.},
  entered_on =	 {07/14/06},
}
@article{chen-wang-06,
  title =	 {On the attenuation and amplification of molecular
                  noise in genetic regulatory networks},
  journal =	 {BMC Bioinformatics},
  year =	 2006,
  volume =	 7,
  pages =	 {52},
  author =	 {B-S Chen and Y-C Wang},
  abstract =	 {Background: Noise has many important roles in cellular
                  genetic regulatory functions at the nanomolar
                  scale. At present, no good theory exists for
                  identifying all possible mechanisms of genetic
                  regulatory networks to attenuate the molecular noise
                  to achieve regulatory ability or to amplify the
                  molecular noise to randomize outcomes to the
                  advantage of diversity. Therefore, the noise
                  filtering of genetic regulatory network is an
                  important topic for gene networks under intrinsic
                  fluctuation and extrinsic noise. Results: Based on
                  stochastic dynamic regulation equation, the
                  intrinsic fluctuation in reaction rates is modeled
                  as a state-dependent stochastic process, which will
                  influence the stability of gene regulatory network,
                  especially, with low concentrations of reacting
                  species. Then the mechanisms of genetic regulatory
                  network to attenuate or amplify extrinsic
                  fluctuation are revealed from the nonlinear
                  stochastic filtering point of view. Furthermore, a
                  simple measure of attenuation level or amplification
                  level of extrinsic noise for genetic regulatory
                  networks is also introduced by nonlinear robust
                  filtering method. Based on the global linearization
                  scheme, a convenient method is introduced to measure
                  noise attenuation or amplification for each gene of
                  the nonlinear stochastic regulatory network by
                  solving a set of filtering problems, which
                  correspond to a set of linearized stochastic
                  regulatory networks. Finally, by the proposed
                  methods, several simulation examples of genetic
                  regulatory networks are given to measure their
                  robust stability under intrinsic fluctuations, and
                  to estimate the genes' attenuation and amplification
                  levels under extrinsic noises. Conclusions: In this
                  study, a stochastic nonlinear dynamic model is
                  developed for genetic regulatory networks under
                  intrinsic fluctuation and extrinsic noise. By the
                  method we proposed, we could determine the robust
                  stability under intrinsic fluctuations and identify
                  the genes that are significantly affected by
                  extrinsic noises, which we call the weak structure
                  of the network. This method will be potential for
                  robust gene circuit design in future, on which a
                  drug design could be based.},
  pdf =		 {bionets/cheng-wang-06.pdf},
  comments =	 {},
}

@article{klemm-bronholdt-05,
  journal =	 {Proc. Natl. Acad. Sci},
  year =	 2005,
  volume =	 102,
  pages =	 {18414--18419},
  title =	 {Topology of biological networks and reliability of
                  information processing},
  author =	 {K Klemm and S Bornholdt},
  abstract =	 {Survival of living cells and organisms is largely based on
                  highly reliable function of their regulatory
                  networks. However, the elements of biological
                  networks, e.g., regulatory genes in genetic networks
                  or neurons in the nervous system, are far from being
                  reliable dynamical elements. How can networks of
                  unreliable elements perform reliably? We here
                  address this question in networks of autonomous
                  noisy elements with fluctuating timing and study the
                  conditions for an overall system behavior being
                  reproducible in the presence of such noise. We find
                  a clear distinction between reliable and unreliable
                  dynamical attractors. In the reliable case,
                  synchrony is sustained in the network, whereas in
                  the unreliable scenario, fluctuating timing of
                  single elements can gradually desynchronize the
                  system, leading to nonreproducible behavior. The
                  likelihood of reliable dynamical attractors strongly
                  depends on the underlying topology of a
                  network. Comparing with the observed architectures
                  of gene regulation networks, we find that those
                  3-node subgraphs that allow for reliable dynamics
                  are also those that are more abundant in nature,
                  suggesting that specific topologies of regulatory
                  networks may provide a selective advantage in
                  evolution through their resistance against noise.},
  pdf =		 {bionets/klemm-bronholdt-05.pdf},
  comments =	 {The opening statements about reliability of neuronal
                  firing and protein concentrations are questionable,
                  just as many papers that they refer to. The paper is
                  not about information processing--there are never
                  any signals. Hill time, noiseless coupling is
                  considered, but with delay. (Why? either we have
                  well-mixed system, and then there's no delay, or the
                  system is inhomogeneous, and then PDEs are
                  needed. Or do they mean delay due to, say,
                  translation?) Noise is introduced by varying the
                  time delay (weird). Even stranger is the choice of
                  fluctuations: interchanging jumps up and down in the
                  delay with varying amplitudes. Some topologies
                  (i.e., feed forward) show a very small sensitivity
                  to varying delays (but, again, this has little to do
                  with signal processing). Feedback loops lose stable
                  dynamics as a function of fluctuations.},
  entered_on =	 {05/18/06},
}
@article{eldar-etal-03,
  journal =      {Developmental Cell},
  volume =       5,
  pages =        {635--646},
  year =         2003,
  title =        {Self-Enhanced Ligand Degradation Underlies
                  Robustness of Morphogen Gradients},
  author =       {A Eldar and D Rosin and B-Z Shilo and N Barkai},
  pdf =          {bionets/eldar-etal-03.pdf},
  abstract =     {Morphogen gradients provide long-range positional
                  information by extending across a developing
                  field. To ensure reproducible patterning, their
                  profile is invariable despite genetic or
                  environmental fluctuations. Common models assume a
                  morphogen profile that decays exponentially. Here,
                  we show that exponential profiles cannot, at the
                  same time, buffer fluctuations in morphogen
                  production rate and define long-range gradients. To
                  comply with both requirements, morphogens should
                  decay rapidly close to their source but at a
                  significantly slower rate over most of the
                  field. Numerical search revealed two network designs
                  that support robustness to fluctuations in morphogen
                  production rate. In both cases, morphogens enhance
                  their own degradation, leading to a higher
                  degradation rate close to their source. This is
                  achieved through reciprocal interactions between the
                  morphogen and its receptor. The two robust networks
                  are consistent with properties of the Wg and Hh
                  morphogens in the Drosophila wing disc and provide
                  novel insights into their function.},
  comments =     {The theory to achieve power law morphogen decays is
                  pretty simple. The screen for robust models,
                  however, is limited only to models that form a
                  superset of two well-known morphogen signaling
                  systems (Hh and Wg). It is, therefore, not a
                  surprise that similar looking systems emerge as
                  robust networks. Some analysis is done to compare
                  the obtained robust models to the actual Hh and Wg,
                  and the comparison is mildly interesting. However,
                  good agreement is to be expected since the search
                  for robust models is within the class that includes,
                  basically, Hh and Wg as the basis.},
}

@article{lazebnik-04,
  title =        {Can a Biologist Fix a Radio? --- or, What {I} Learned
                  while Studying Apoptosis},
  author =       {Y. Lazebnik},
  abstract =     {This article by Yu. Lazebnik, ``Can a Biologist Fix a
                  Radio? --- or, What I Learned while Studying
                  Apoptosis'' has already been published in English
                  (Cancer Cell, 2002, 2, 179-182) and in Russian
                  (Uspekhi Gerontologii, 2003, No. 12,
                  166-171). Nevertheless, we have undertaken its
                  secondary publication in our journal for two
                  reasons: first, our journal has different readers,
                  and, second, the great significance of this manifest
                  of Yuri Lazebnik. The author in bright and clever
                  form shows the emerging necessity to create
                  formalized language designed to describe complicated
                  systems of regulation of biochemical processes in
                  living cells. The article is published with
                  permission of Cancer Cell and Uspekhi Gerontologii.},
  journal =      {Biochemistry (Moscow)},
  volume =       69,
  number =       12,
  year =         2004,
  pages =        {1403--1406},
  entered_on =   {12/20/05},
  pdf =          {bionets/lazebnik-04.pdf},
  comments =     {Worth a read.},
}


@inproceedings{bialek-01,
  title =        {Stability and Noise in Biochemical Switches},
  author =       {William Bialek},
  editor =       {Leen, Todd K. and
                  Dietterich, Thomas G. and
                  Tresp, Volker},
  booktitle =    {Advances in Neural Information Processing Systems 13},
  publisher =    {MIT Press},
  year =         {2001},
  pages =        {103--109},
}


@article{dekel-alon-05,
  title =        {Optimality and evolutionary tuning of the expression
                  level of a protein},
  author =       {Erez Dekel and Uri Alon},
  abstract =     {Different proteins have different expression
                  levels. It is unclear to what extent these
                  expression levels are optimized to their
                  environment. Evolutionary theories suggest that
                  protein expression levels maximize fitness 1--11,
                  but the fitness as a function of protein level has
                  seldom been directly measured. To address this, we
                  studied the lac system of Escherichia coli, which
                  allows the cell to use the sugar lactose for growth
                  12. We experimentally measured the growth burden
                  13,14 due to production and maintenance of the Lac
                  proteins (cost), as well as the growth advantage
                  (benefit) conferred by the Lac proteins when lactose
                  is present. The fitness function, given by the
                  difference between the benefit and the cost,
                  predicts that for each lactose environment there
                  exists an optimal Lac expression level that
                  maximizes growth rate. We then performed serial
                  dilution evolution experiments at different lactose
                  concentrations. In a few hundred generations, cells
                  evolved to reach the predicted optimal expression
                  levels. Thus, protein expression from the lac operon
                  seems to be a solution of a cost--benefit
                  optimization problem, and can be rapidly tuned by
                  evolution to function optimally in new
                  environments.},
  journal =      {Nature},
  volume =       436,
  number =       7050,
  year =         2005,
  pages =        {588--592},
  pdf =          {bionets/dekel-alon-05.pdf},
  entered_on =   {07/28/05},
  comments =     {This is one of very few papers I know that does
                  explicit evolution experiments to show that
                  organisms are adapted by evolution (either on
                  genetic or epigenetic level; this has not been ruled
                  on definitely in the paper) to be at some optimal
                  evolutionary state. As much as I like the idea, the
                  implementation is quite raw, even though probably
                  "good enough" for a first try. In these experiments,
                  the Lac system is fully induced (irrespectively of
                  the Lac concentration) by the presence of IPTG; thus
                  evolution takes the place of the usual LacY/Z
                  expression control by galactose (wouldn't the
                  optimal solution in this case be to start
                  metabolizing IPTG, or do some other drastic genetic
                  change?) The cost and benefit are measured by the
                  growth rate. This is probably fine, though one has
                  to make sure that effects like inhomogeneous
                  dilution do not favor bacteria that do not grow
                  fast, but know where to be when the dilution
                  starts. The measuring of costs/benefits is more
                  troublesome. The cost of LacZ expression is measured
                  by changing LacZ by varying the level of IPTG. Is it
                  clear that this cost is the same irrespectively of
                  whether the Lac operon is activated by IPTG or by
                  lactose? Since IPTG is non-metabolizable, it is
                  reasonable to believe that the resources available
                  to the cell under IPTG induction are smaller than
                  those under lactose induction, and, therefore, the
                  variable M in Eq 2, changes as the function of the
                  lactose fraction in the lactose/IPTG mixture (see
                  also Supp Fig 8). When measuring the benefits of
                  LacZ expression, this is done at saturating levels
                  of IPTG+lactose (0.2mM lactose fully induces Lac
                  operon). Again, is it clear that the benefit of a
                  fully expressed LacZ system as a function of the
                  lactose concentration is independent of whether the
                  IPTG is present or not? Maybe it is effluxed, or
                  maybe it otherwise decreases or increases the
                  benefits? Suppl Figure 4 shows that the growth rate
                  for different patches of the same bacteria vary by
                  more than the mean growth rates between different
                  bacteria involved in the experiment. The populations
                  are inhomogeneous. Is it then reasonable to talk
                  about the average growth rate benefit, and study the
                  cells' optimality, when the cells are so variable?
                  Single cell experiments are needed. When looking at
                  the main text Figure 4b and Fig 9c in the
                  supplement, we see that the agreement of
                  evolutionary results with the theory is qualitative
                  at best. For linear benefits and singular losses,
                  the theory/experiment agree only at 0 lactose and at
                  saturation, and there is >5std.dev. error at the
                  only mid-point measured. When the benefits are
                  corrected past the linear regime with all the glory
                  of the chemical kinetics, the error in the middle
                  drops to about 3 stddev, while a consistent 2
                  stddev error emerges for saturation values. Things
                  are even worse for the cost function -- minuscule
                  changes in its shape produce dramatic effects on the
                  best LacZ expression. The singular resource limited
                  cost seems to work more or less, but do we really
                  trust such fine tuning? Evolutionary modeling seems
                  fine, but, again, disagreements with experiments are
                  of the order of many error bars. It is probably not
                  true that this evolution can be modeled by emergence
                  of a single optimum trait; more complex processes
                  are probably important. In general this work suffers
                  from the problem that the optimal response is very
                  much dependent on "what the cell wants," that is, on
                  the unknown costs and benefits. We need evolutionary
                  experiments that would be free of making assumptions
                  about the utility of the behavior to the cell. Final
                  note: in the supplement, it is said that lactose and
                  glycerol are utilized simultaneously.},
}

@article{cimino-hervagault-87,
  journal =      {Biochem. Biophys. Res. Commun.},
  volume =       149,
  number =       2,
  year =         1987,
  pages =        {615--620},
  title =        {Experimental evidence for zero-order
                  ultrasensitivity in a simple substrate cycle},
  author =       {A Cimino and J-F Hervagault},
  abstract =     {It was shown [Goldbeter and Koshland, 1981, PNAS]
                  that amplified sensitivity may arise in reversible
                  covalent modification systems, where the converter
                  enzymes operate in their zero-order region. We show
                  that "zero-order ultrasensitivity" may also occur in
                  simple substrate cycles. The experimental study
                  deals with the Formate/Lactic dehydrogenases model
                  cycle, interconverting the reduced and oxidized
                  forms of NAD. For NAD(H) concentrations high enough
                  (with respect to the enzyme $K_M$'s), abrupt changes
                  in the steady-state substrate concentrations may
                  result from small variations in the ratio of maximal
                  enzyme activities. The amplification factors are
                  measured. Implications in metabolic regulation are
                  also taken up.},
  comments =     {Push-pull loop considered (experimentally, and
                  rudimentary theory). It's shown that the steady
                  state concentration of the activated circulating
                  enzyme may be quite sensitive to the parameters of
                  the biochemistry.},
}
@article{wall-etal-05,
  author =       {M Wall and M Dunlop and W Hlavacek},
  title =        {Multiple functions of a feed-forward-loop gene
                  circuit},
  journal =      {J. Mol. Biol.},
  year =         2005,
  volume =       349,
  number =       3,
  pages =        {501--514},
  abstract =     {The feed-forward-loop (FFL), a network motif in
                  genetic regulatory networks, involves two
                  transcription factors (TFs): one regulates the
                  expression of the second, and both TFs regulate the
                  expression of the effector gene. Analysis of FFL
                  design principles has been initiated, but the
                  functional significance of the FFL is still
                  unclear. In theoretical studies so far, the TFs are
                  assumed to interact with different signals, which is
                  common. However, we have found examples of FFLs in
                  E. coli in which both TFs interact with the same
                  signal. These examples belong to the repressor of
                  transcription. Here, we analyze mathematical models
                  of this class of circuits, examining a comprehensive
                  array of subclasses that differ in the way a signal
                  modulates the activities of the TFs. Through
                  parameter variation, we characterize statistically
                  how input/output (I/O) behavior and temporal
                  responsiveness are predicted to depend on the wiring
                  of qualitatively distinct steady-state I/O patterns,
                  including inducible and repressible patterns. Some
                  subclasses exhibit as many as six
                  patterns. Transient pulses are also possible, and
                  the response of a circuit to a signal may be either
                  faster or slower than that of a gene circuit in
                  which there is only one TF. Our results provide a
                  catalog of functions for a class of FFL circuits,
                  whose subclasses have different breadths of possible
                  behaviors and different typical behaviors.},
  url =          {http://EcoTFs.lanl.gov},
  comments =     {Zoology of a completely repressive FFL. No noise,
                  but all parameter values are
                  considered. Importantly, response of the cell to an
                  external signal (not to change in concentration of
                  the original promoter) is considered. This is
                  similar to our own work, and is a good step above
                  Alon's work. A whole zoo of behaviors is seen as a
                  function of these parameters. Thus, there is no
                  unique function of a FFL.},
}

@article{bray-95,
  pdf =          {bionets/bray-95.pdf},
  title =        {Protein molecules as computational elements in
                  living cells},
  author =       {D Bray},
  journal =      {Nature},
  volume =       376,
  year =         1995,
  pages =        {307--312},
  abstract =     {Many proteins in living cells appear to have as
                  their primary function the transfer and processing
                  of information, rather than the chemical
                  transformation of metabolic intermediates or the
                  building of cellular structures. Such proteins are
                  functionally linked through allosteric or other
                  mechanisms into biochemical 'circuits' that perform
                  a variety of simple computational tasks including
                  amplification, integration and information storage.},
  comments =     {Reference to 1977 Stadtman papers (ref 14, 15) --
                  seems like the first <em>theoretical</em>
                  introduction to push-pull cascades. Also look at ref
                  13 by Arkin and Ross. Reference 24 -- building a
                  universal computer from chemical gates; chemical
                  computers are as good as neuronal ones.},
}

@article{sachs-etal-05,
  title =        {Causal Protein-Signaling Networks Derived from
                  Multiparameter Single-Cell Data},
  author =       {K Sachs and O Perez and D Pe'er and D Lauffenburger
                  and G Nolan},
  abstract =     {Machine learning was applied for the automated
                  derivation of causal influences in cellular
                  signaling networks. This derivation relied on the
                  simultaneous measurement of multiple phosphorylated
                  protein and phospholipid components in thousands of
                  individual primary human immune system
                  cells. Perturbing these cells with molecular
                  interventions drove the ordering of connections
                  between pathway components, wherein Bayesian network
                  computational methods automatically elucidated most
                  of the traditionally reported signaling
                  relationships and predicted novel interpathway
                  network causalities, which we verified
                  experimentally. Reconstruction of network models
                  from physiologically relevant primary single cells
                  might be applied to understanding native-state
                  tissue signaling biology, complex drug actions, and
                  dysfunctional signaling in diseased cells.},
  journal =      {Science},
  year =         2005,
  volume =       308,
  pages =        {523--529},
  pdf =          {bionets/sachs-etal-05.pdf},
  comments =     {Application of BN's to flow cytometry data for
                  signaling networks reconstruction. *Nothing* new is
                  uncovered in biological realm. Why did this get
                  published?},
}

@unpublished{gottschling-05,
  author =       {D Gottschling},
  title =        {Loss of Heterozygosity in Old Yeast:
                  A Model for Age-Induced Cancer},
  note =         {Talk at Bio05 at KITP},
  url =          {http://online.kitp.ucsb.edu/online/bio05/gottschling},
  comments =     {Yeasts show aging, and they stop dividing
                  after about 40 divisions. This is explained
                  by accumulation of epigenetic mutations
                  (bad proteins), which are retained in the
                  mother cell, while the daughter cell starts
                  life fresh.},
}

@unpublished{rothman-05,
  url =          {http://online.kitp.ucsb.edu/online/bio05/rothman/},
  author =       {J Rothman},
  note =         {Talk at Bio05 at KITP},
  comments =     {For C. elegans, what is the network that regulates
                  apoptosis? High throughput and detailed measurements
                  are used. Experiments are non-quantitative, but, at
                  best, binary. They then use Bayesian networks
                  (maximum spanning + cutoffs) to infer the regulatory
                  network of cell death. Not the best choice. Singh
                  and Jammalamadaka from UCSB have worked on this as
                  well with this guy. Don't think there's a great
                  opportunity for quantitative biology here.},
  title =        {Life or Death Decisions: Regulation of Programmed
                  Cell Death in {C. elegans}},
}

@article{thattai-shraiman-03,
  pages =        {744--754},
  journal =      {Biophys. J.},
  volume =       85,
  year =         2003,
  title =        {Metabolic Switching in the Sugar Phosphotransferase
                  System of {Escherichia coli}},
  author =       {M Thattai and B Shraiman},
  abstract =     {Bacteria grown in a mixture of multiple sugars will
                  first metabolize a preferred sugar until it is
                  nearly depleted, only then turning to other carbon
                  sources in the medium. This sharp switching of
                  metabolic preference is characteristic of systems
                  that optimize fitness. Here we consider the
                  mechanism by which switching can occur in the
                  Escherichia coli phosphotransferase system (PTS),
                  which regulates the uptake and metabolism of several
                  sugars. Using a model combining the description of
                  fast biochemical processes and slower genetic
                  regulation, we derive metabolic phase diagrams for
                  the uptake of two PTS sugars, indicating regions of
                  distinct sugar preference as a function of external
                  sugar concentrations. We then propose a
                  classification of bacterial phenotypes based on the
                  topology of the metabolic phase diagram, and
                  enumerate the possible topologically distinct
                  phenotypes that can be achieved through mutations of
                  the PTS. This procedure reveals that there is only
                  one nontrivial switching phenotype that is
                  insensitive to large changes in biochemical
                  parameters. This phenotype exhibits diauxic growth,
                  a manifestation of the winner-take-all dynamics
                  enforced by PTS architecture. Winner-take-all
                  behavior is implemented by the induction of
                  sugar-specific operons, combined with competition
                  between sugars for limited phosphoryl flux. We
                  propose that flux-limited competition could be a
                  common mechanism for introducing repressive
                  interactions in cellular networks, and we argue that
                  switching behavior similar to that described here
                  should occur generically in systems that implement
                  such a mechanism.},
  pdf =          {bionets/thattai-shraiman-03.pdf},
  entered_on =   {02/04/05},
  comments =     {Good ref. to Edwards et al. 2001 on metabolic flux
                  balancing in E. coli. Good point that optimal flux
                  balancing may produce huge rebalancing for tiny
                  environmental changes. Maybe that's why the negative
                  feedback PTS loop is needed? PTS is designed to be
                  Winner-Take-All (only this nontrivial behavior is
                  realized under many different parameter values
                  involved in the model, and, therefore, is robust to
                  mutations). Simple derivation of why the WTA is good
                  when metabolizing many sugars. Don't quite get Eq. 4
                  (the nonlinear dependence of the influx of sugar on
                  the expression of the permease, which is shown in
                  this Eq. is essential for switching in PTS; other
                  nonlinearity would also do well). The paper looks at
                  almost instantaneous equilibration at the PTS point,
                  and then slow dynamics of expression. I am instead
                  interested in analysis which is quasistationary wrt
                  genetics, and calculating responses at the PTS time
                  scale. Paper assumes that the PTS sugar intake
                  saturates faster than enzyme expressions as a
                  function of the intracellular sugar concentrations,
                  so that expression can be treated as linear
                  functions of intracellular sugar. Due to effect of
                  the permease (additional positive feedback loop),
                  they get hysteretic switching even in 1 sugar
                  regime. Did Savageau take LacY into account? How
                  does this agree with their own recent Nature paper
                  that notices absence of hysteresis for lactose (as
                  compared to TMG). Good point that continuous,
                  nonswitching behavior, may be obtained by changing
                  demands for sugars, which can be done through
                  cap/camp mechanism, which will increase production
                  of all permeases and requests for all sugars, but
                  not proportionally. Robust but tunable -- "precise
                  adaptation is a robust property of the system, while
                  adaptation times can be tuned by modification of
                  protein levels." Point on suppression of cross-talk
                  in chemotaxis by means of WTA architecture. PTS
                  chemical kinetics appendix -- useful. The EIIA/BC
                  seems to be the bottleneck in sugar transfers -- see
                  number in table 1. After reading this, the view of
                  the lac system is as follows: the fast metabolic
                  positive feedback loop through PEP is basically a
                  signal predictor, a low pass filter, trying to
                  account for the uptake at the next moment of time
                  given the previous data. Then there is possibly a
                  negative feedback loop
                  \cite{bruckner-titgemeyer-02}, and a bunch of
                  slower genetic controls -- positive feedback through
                  transferases and negative feedback through
                  metabolizing enzymes. One needs to figure out if the
                  system filters out high frequency noise, then
                  predicts the future signal, and allows to respond to
                  it in such a way that the slow genetic control
                  adapts the steady state of the system to the average
                  sugar concentration and also possibly to the sugar
                  variance. This is getting interesting.},
}

@article{vilar-etal-03,
  title =        {Modeling network dynamics: the lac operon, a case
                  study},
  author =       {JMG Vilar and C Guet and S Leibler},
  journal =      {J. Cell Biol.},
  volume =       161,
  number =       3,
  year =         2003,
  pages =        {471--476},
  pdf =          {bionets/vilar-etal-03.pdf},
  entered_on =   {02/02/05},
  abstract =     {We use lac operon in Escherichia coli as a prototype
                  system to illustrate the current state,
                  applicability, and limitations of modeling dynamics
                  of cellular networks. We integrate three different
                  levels of description (molecular, cellular, and that
                  of cell population) into a single model, which seems
                  to capture many experimental aspects of the system.},
  comments =     {They refer to 1957 experiments of Novick and Weiner
                  stating that Lac circuit is a bistable
                  system. According to \cite{ozbudak-etal-04},
                  specifically the supplementary info there, this
                  system is not bistable (the 1957 paper might have
                  used TMG or other gratuitous inducers as well, then
                  there would be no contradiction). Specific comments:
                  Molecular modeling -- since the inducer in the Lac
                  system is actually metabolized (this is allolactose,
                  not actually lactose, and it is metabolized into
                  glucose), there is never a situation (even when the
                  permease, LacY, is knocked out), when the
                  concentration of Lac inside and outside the cell is
                  equilibrated at the same level. Good literature
                  review (e.g., Ito and Akiyama, 1999 -- permease
                  activity measurements; Liu et al -- efflux glucose
                  pumps in the system). One of the important points
                  for us to discuss -- what is the dynamics for the
                  permease to get active (i.e., to bind to the cell
                  surface). In the description of the model, it's
                  unclear why there is no term in the equation for Y
                  describing process from Y to Yf. Good discussion of
                  stochastic induction.},
}

@article{bruckner-titgemeyer-02,
  journal =	 {FEMS Microbiology Letters},
  volume =	 209,
  number =	 2,
  year =	 2002,
  pages =	 {141--148},
  title =	 {Carbon catabolite repression in bacteria: choice of
                  the carbon source and autoregulatory limitation of
                  sugar utilization},
  author =	 {Reinhold Br{\"u}ckner and Fritz Titgemeyer},
  abstract =	 {Carbon catabolite repression (CCR) in bacteria is
                  generally regarded as a regulatory mechanism to
                  ensure sequential utilization of
                  carbohydrates. Selection of the carbon sources is
                  mainly made at the level of carbohydrate-specific
                  induction. Since virtually all carbohydrate
                  catabolic genes or operons are regulated by specific
                  control proteins and require inducers for high level
                  expression, direct control of the activity of
                  regulators or control of inducer formation is an
                  efficient measure to keep them silent. By these
                  mechanisms, bacteria are able to establish a
                  hierarchy of sugar utilization. In addition to the
                  control of induction processes by CCR, bacteria have
                  developed global transcriptional regulation
                  circuits, in which pleiotropic regulators are
                  activated. These global control proteins, the
                  catabolite gene activator protein (CAP), also known
                  as cAMP receptor protein, in Escherichia coli or the
                  catabolite control protein (CcpA) in Gram-positive
                  bacteria with low GC content, act upon a large
                  number of catabolic genes/operons. Since practically
                  any carbon source is able to trigger global
                  transcriptional control, expression of sugar
                  utilization genes is restricted even in the sole
                  presence of their cognate substrates. Consequently,
                  CAP- or CcpA-dependent catabolite repression serves
                  as an autoregulatory device to keep sugar
                  utilization at a certain level rather than to
                  establish preferential utilization of certain carbon
                  sources. Together with other autoregulatory
                  mechanisms that are not acting at the gene
                  expression level, CCR helps bacteria to adjust sugar
                  utilization to their metabolic
                  capacities. Therefore, catabolic/metabolic balance
                  would perhaps better describe the physiological role
                  of this regulatory network than the term catabolite
                  repression.},
  pdf =		 {bionets/bruckner-titgemeyer-02.pdf},
  entered_on =	 {12/09/04},
  comments =	 {The reason for looking into this paper for me was to
                  find if there are negative feedback loops that would
                  stabilize lactose uptake or metabolization for sharp
                  and fast peaks in extracellular lactose
                  concentration. We are looking for
                  post-transcriptional regulation of such
                  kind. Options are: for stabilizing uptake -- (1) can
                  intracellular glucose from the lac-glu cascade be
                  phosphorylated by EIIAglc, so that EIIAglc then can
                  inhibit effect of LacY (lac permease)? or maybe (2)
                  activity of HPr, which supplies phosphor to EIIAglc,
                  is enzymatically controlled? A possible way for this
                  is metabolism of the internalized carbon sources,
                  which determines the ratio of phosphoenolpyruvate to
                  pyruvate, which influences, via EI and HPr, the
                  phosphorylation state of EIIAglc. Slower,
                  transcriptional, controls may come from cAMP, whose
                  activity depends on glucose through phosphorylation
                  of EIIAglc, which is not phosphorylated for large
                  glucose concentration. There are also possible slow
                  posttranscriptional but pretranslational controls
                  (stability of mRNA is controlled by CsrA, which is
                  affected by CAP). Note that HPr and EIIAglc are on
                  the same operon, and thus expressed
                  similarly. Another transcriptional loop is
                  (phosphorylated EIIAglc) activates CAP (that is,
                  small glucose --> increase of lactose uptake). Note
                  on the presence of glucose in the cell and its
                  effect on phosphorylation state of EIIAglc:
                  "Surprisingly, glucose 6-phosphate and several other
                  non-PTS substrates caused dephosphorylation of
                  EIIAglc to various extents. To explain these
                  unexpected results, the authors proposed the PEP to
                  pyruvate ratio to be involved in the control of
                  EIIAglc phosphorylation state. The PEP/pyruvate
                  ratio in turn is influenced by metabolism of
                  carbohydrates." Another feedback loop is through the
                  glycolytic pathway (page 5, left column); this needs
                  to be looked at. In section 5 they start talking
                  about autoregulation aspects of the circuit, and say
                  that even in the presence of lactose Lac is not
                  fully activated -- some kind of autoregulation. "In
                  a mutant strain with a lactose permease that is
                  insensitive to EIIAglc-mediated inhibition, lactose
                  became inhibitory for growth, when lac gene
                  expression was additionally stimulated by exogenous
                  cAMP" -- there is some kind of probably metabolic
                  penalty for full expression of Lac and its
                  overactive utilization. Reason for CCR is believed
                  to be the desire to get a fixed amount of sugar with
                  the least gene expression -- "If only one carbon
                  source is available, CCR ... is the inevitable
                  consequence of carbohydrate metabolism acting as a
                  built-in autoregulatory device to limit carbohydrate
                  consumption." Need to get refs. 2, 6, 28, 29, 45.},
}

@article{wong-etal-97,
  title =	 {Mathematical Model of the lac Operon: Inducer
                  Exclusion, Catabolite Repression, and Diauxic Growth
                  on Glucose and Lactose},
  author =	 {Patrick Wong and Stephanie Gladney and Jay D. Keasling},
  abstract =	 {A mathematical model of the lactose (lac) operon
                  was developed to study diauxic growth on glucose and
                  lactose. The model includes catabolite repression,
                  inducer exclusion, lactose hydrolysis to glucose and
                  galactose, and synthesis and degradation of
                  allolactose. Two models for catabolite repression
                  were tested: (i) cyclic AMP (cAMP) synthesis
                  inversely correlated with the external glucose
                  concentration and (ii) synthesis inversely
                  correlated with the glucose transport rate. No
                  significant differences in the two models were
                  observed. In addition to synthesis, degradation and
                  secretion of cAMP were also included in the
                  model. Two models for the phosphorylation of the
                  glucose produced from lactose hydrolysis were also
                  tested: (i) phosphorylation by intracellular
                  hexokinase and (ii) secretion of glucose and
                  subsequent phosphorylation upon transport back into
                  the cell. The latter model resulted in weak
                  catabolite repression when the glucose produced from
                  lactose was transported out of the cell, whereas the
                  former model showed no catabolite repression during
                  growth on lactose. Parameter sensitivity analysis
                  indicates the importance of key parameters to lac
                  operon expression and cell growth: the lactose and
                  allolactose transformation rates by
                  $\beta$-galactosidase and the glucose concentrations
                  that affect catabolite repression and inducer
                  exclusion. Large values of the allolactose
                  hydrolysis rate resulted in low concentrations of
                  allolactose, low-level expression of the lac operon,
                  and slow growth due to limited import and metabolism
                  of lactose; small values resulted in a high
                  concentration of allolactose, high-level expression
                  of the lac operon, and slow growth due to a limiting
                  concentration of glucose 6-phosphate formed from
                  allolactose. Changes in the rates of all
                  $\beta$-galactosidase-catalyzed reactions showed
                  similar behavior, but had more drastic effects on
                  the growth rate. Changes in the glucose
                  concentration that inhibited lactose transport could
                  extend or contract the diauxic growth period during
                  growth in the presence of glucose and
                  lactose. Moreover, changes in the glucose
                  concentration that affected catabolite repression
                  affected the cAMP levels and lac operon expression,
                  but had a lesser effect on the growth rate.},
  journal =	 {Biotechnology Progress},
  year =	 1997,
  volume =	 13,
  number =	 2,
  pages =	 {132--143},
  pdf =		 {bionets/wong-etal-97.pdf},
  entered_on =	 {12/09/04},
  comments =	 {Reasonably detailed Lac repressor model. Simulations
                  that describe how a colony would grow given
                  different mechanisms for metabolization of
                  lactose-derived glucose, and a few other things. No
                  experimental data, but good review of literature
                  with various experimental numbers.},
}


@article{smolen-etal-99,
  title =	 {Effects of macromolecular transport and stochastic
                  fluctuations on dynamics of genetic regulatory
                  systems},
  author =	 {Paul Smolen and Douglas A. Baxter and John H. Byrne},
  journal =	 {American Journal of Physiology},
  volume =	 277,
  number =	 4,
  pages =	 {C777--C790},
  year =	 1999,
  note =	 {Cell Physiol. 46},
  abstract =	 {To predict the dynamics of genetic regulation, it
                  may be necessary to consider macromolecular
                  transport and stochastic fluctuations in
                  macromolecule numbers. Transport can be diffusive or
                  active, and in some cases a time delay might suffice
                  to model active transport. We characterize major
                  differences in the dynamics of model genetic systems
                  when diffusive transport of mRNA and protein was
                  compared with transport modeled as a time
                  delay. Delays allow for history-dependent,
                  non-Markovian responses to stimuli (i.e.,
                  ``molecular memory''). Diffusion suppresses
                  oscillations, whereas delays tend to create
                  oscillations. When simulating essential elements of
                  circadian oscillators, we found the delay between
                  transcription and translation necessary for
                  oscillations. Stochastic fluctuations tend to
                  destabilize and thereby mask steady states with few
                  molecules. This computational approach, combined
                  with experiments, should provide a fruitful
                  conceptual framework for investigating the function
                  and dynamic properties of genetic regulatory
                  systems.},
  entered_on =	 {12/05/04},
  pdf =		 {bionets/smolen-etal-99.pdf},
  comments =	 {Effects of diffusion vs active transport on
                  biochemical reactions. However, since diffusion time
                  in a cell is actually quite small (often smaller
                  than most other times involved) should we even
                  bother with considering how a molecule finds its
                  place in the cell? Decent literature collection on
                  how steady states may lose stability due to effects
                  of transportation delay (mostly oscillatory states
                  start to emerge), on effects of low-copy-number
                  stochasticity (Need to download
                  \cite{mcadams-arkin-97}.), and also on emergence of
                  various behaviors (oscillation, multistability,
                  etc.) in various circuits. The time-delayed systems
                  behave as intuitively expected -- no surprises
                  here. Delay introduces memory; but in 1-d (time
                  only) system, any short term memory is not important
                  for long term behavior of the system
                  (Mermin-Wagner). All analysis is numerical in this
                  paper.},
}

@article{mcadams-arkin-97,
  author =	 {Harley H. McAdams and Adam Arkin},
  title =	 {Stochastic mechanisms in gene expression},
  journal =	 {Proc. Natl. Acad. Sci. USA},
  volume =	 94,
  number =	 3,
  pages =	 {814--819},
  year =	 1997,
  comments =	 {TO GET},
}

@article{savageau-02,
  title =	 {Alternative designs for a genetic switch: Analysis
                  of switching times using the piecewise power-law
                  representation},
  author =	 {Michael A. Savageau},
  abstract =	 {Some genes are thought to be switched
                  discontinuously ON or OFF in response to
                  environmental or developmental stimuli, whereas
                  other genes are thought to be switched in a
                  continuously variable fashion. We have previously
                  identified criteria that distinguish between
                  discontinuous and continuous genetic switches for an
                  inducible catabolic pathway. These two types of
                  switches exhibit several additional characteristics,
                  beyond their qualitatively distinct behaviors, that
                  influence their natural selection. These
                  characteristics include threshold value, magnitude
                  of the input signal required for switching
                  (`switching effort'), magnitude of the corresponding
                  output signal, duty cycle, switching time, and
                  robustness. In order to characterize the biological
                  design principles governing such switches, we have
                  developed mathematical models of generic gene
                  circuits and analyzed their behavior. Here we report
                  the results of a comparative study designed to
                  identify essential differences in switching
                  time. This study has been greatly facilitated by use
                  of the piecewise power-law representation, which was
                  first developed by systems engineers in the 1940s
                  and adapted for biochemical systems in the early
                  1970s. With this approach, we have been able to
                  derive analytical expressions for switching
                  time. When the alternative designs are made as
                  nearly equivalent as possible, by the method of
                  mathematically controlled comparison, we find that
                  the switching times for the continuous case are less
                  than that for the corresponding discontinuous
                  case. We also find that ON times are faster than OFF
                  times in all cases. These results are discussed in
                  the specific context of the inducible lactose operon
                  of Escherichia coli.},
  journal =	 {Mathematical Biosciences},
  volume =	 180,
  year =	 2002,
  pages =	 {237--253},
  pdf =		 {bionets/savageau-02.pdf},
  entered_on =	 {12/05/04},
  comments =	 {Continuous (static) and discontinuous-hysteretic
                  (dynamic) switches are analyzed to check which one
                  would switch faster, all nonessential parameters
                  being kept equal. The dynamic switch is built from a
                  positive feedback loop, while for a static switch
                  induction of a concentration of a particular enzyme
                  leads to both intake and degradation of the inducer
                  (this is akin to E. coli -- an enzyme transfers
                  lactose across membrane and then degrades it). Such
                  a system will not necessarily produce continuous
                  switching, but it is possible for some set of
                  parameters. It turns out that continuous switch is
                  actually faster switching; additionally it switches
                  ON faster than OFF -- a good quality for nutrient
                  consumption with small dilution. Hysteretic switches
                  are thus bad for metabolism (no graded response),
                  while good for, for example, phenotype
                  differentiation. One should consider whether it is,
                  in general, possible to produce a hysteretic switch
                  with a metabolizable inducer, specifically for
                  natural(istic) stimuli distribution. If the switch
                  has only a few molecules, one might develop
                  back-and-forth switching and get a graded response
                  from a sigma-delta modulator type of behavior even
                  for a hysteretic switch. In a piece-wise linear
                  analysis this should be analyzable by keeping in
                  mind that due to various feedback adaptation loops
                  one will probably have a good match between the
                  dynamic range of the inducer and the width of the
                  hysteretic region (matched response). So a good
                  analysis is to consider that a cell fluctuates
                  inside the hysteretic box with a few switch
                  molecules and see how fast would this circuit
                  respond due to randomly switching between the states.},
}

@article{ozbudak-etal-04,
  title =	 {Multistability in the lactose utilization network of
                  {Escherichia} coli},
  author =	 {Ertugrul M. Ozbudak and Mukund Thattai and Han
                  N. Lim and Boris I. Shraiman and Alexander van
                  Oudenaarden},
  journal =	 {Nature},
  pages =	 {737--740},
  year =	 2004,
  volume =	 427,
  number =	 6976,
  pdf =		 {bionets/ozbudak-etal-04.pdf},
  comments =	 {<a
                  href="bionets/ozbudak-etal-04-suppl.pdf">Supplement</a>. Somewhat
                  misleading paper -- they talk about bistability of
                  lactose circuit, but lactose circuit is actually not
                  bistable, only TMG/IPTG is. Hysteresis is not shown
                  in single cells, only in bacterial
                  populations. Effects of noise on the theoretical
                  model are not analyzed (maybe theoretic model stops
                  being bistable with noise?).},
  entered_on =	 {09/20/04},
  abstract =	 {Multistability, the capacity to achieve multiple
                  internal states in a response to a single set of
                  external inputs, is the defining characteristic of a
                  switch. Biological switches are essential for the
                  determination of cell fate in multicellular
                  organisms, the regulation of cell-cycle oscillations
                  during mitosis and the maintenance of epigenetic
                  traits in microbes. The multistability of several
                  natural and synthetic systems has been attributed to
                  positive feedback loops in their regulatory
                  networks. However, feedback alone does not guarantee
                  multistability. The phase diagram of a multistable
                  system, a concise description of internal states as
                  key parameters are varied, reveals the conditions
                  required to produce a functional switch. Here we
                  present the phase diagram of the bistable lactose
                  utilization network of Escherichia coli. We use this
                  phase diagram, coupled with a mathematical model of
                  the network, to quantitatively investigate processes
                  such as sugar uptake and transcriptional regulation
                  in vivo. We then show how the hysteretic response of
                  the wildtype system can be converted to an
                  ultrasensitive graded response. The phase diagram
                  thus serves as a sensitive probe of molecular
                  interactions and as a powerful tool for rational
                  network design.},
}

@misc{agrawal-03,
  note =	 {arXiv:q-bio.MN/0309019},
  title =	 {Scaling in Counter Expressed Gene Networks
                  Constructed from Gene Expression Data},
  author =	 {Himanshu Agrawal},
  year =	 2003,
  pdf =		 {bionets/agrawal-03.pdf},
  url =		 {http://arxiv.org/abs/q-bio/0309019},
  entered_on =	 {03/17/04},
  abstract =	 {We study counter expressed gene networks constructed
                  from gene-expression data obtained from many types
                  of cancers. The networks are synthesized by
                  connecting vertices belonging to each others' list
                  of K-farthest-neighbors, with K being an a priori
                  selected non-negative integer. In the range of K
                  corresponding to minimum homogeneity, the degree
                  distribution of the networks shows
                  scaling. Clustering in these networks is smaller
                  than that in equivalent random graphs and remains
                  zero till significantly large K. Their small
                  diameter, however, implies small-world behavior
                  which is corroborated by their eigenspectrum. We
                  discuss implications of these findings in several
                  contexts.},
  comments =	 {I think the choice of the metric may be rather
                  important, unlike what the author suggests. The
                  author makes a network by bringing genes with the
                  most different expression (counter-expression)
                  together. The network turns out to have small-world
                  structure, and all the associated manipulations are
                  performed.},
}


@misc{thomas-etal-03,
  note =	 {arXiv:q-bio.MN/0309012},
  title =	 {On the structure of protein-protein interaction
                  networks},
  author =	 {Alun Thomas and Rob Cannings and Nicholas A. M. Monk
                  and Chris Cannings},
  year =	 2003,
  url =		 {http://arxiv.org/abs/q-bio/0309012},
  pdf =		 {bionets/thomas-etal-03.pdf},
  entered_on =	 {03/17/04},
  abstract =	 {We present a simple model for the underlying
                  structure of protein-protein pairwise interaction
                  graphs that is based on the way in which proteins
                  attach to each other in experiments such as yeast
                  two-hybrid assays. We show that data on the
                  interactions of human proteins lend support to this
                  model. The frequency of the number of connections
                  per protein under this model does not follow a power
                  law, in contrast to the reported behaviour of data
                  from large scale yeast two-hybrid screens of yeast
                  protein-protein interactions. Sampling sub-graphs
                  from the underlying graphs generated with our model,
                  in a way analogous to the sampling performed in
                  large scale yeast two-hybrid searches, gives degree
                  distributions that differ subtly from the power law
                  and that fit the observed data better than the power
                  law itself. Our results show that the observation of
                  approximate power law behaviour in a sampled
                  sub-graph does not imply that the underlying graph
                  follows a power law.},
  comments =	 {A simple model is proposed to calculate the
                  in-degree statistics for interactomes -- completely
                  connected subnetworks in a protein interaction
                  network, and to explain their abundance. The details
                  of this very simple model are not very
                  important. What is important is the authors'
                  observation (which parallels my own ideas on the
                  subject) that the degree distribution for this
                  model, which is very much non-power-law, may look
                  like a power law due to sampling effects, when only
                  a small part of the network is uncovered. Maybe this
                  is the reason for the seeming power law abundance in
                  nature?},
}

@inproceedings{holme-huss-03,
  booktitle =	 {3rd Workshop on Computation of Biochemical Pathways
                  and Genetic Networks, European Media Lab
                  Proceedings},
  editor =	 {R. Gauges and U. Kummer and J. Pahle and U. Rost},
  publisher =	 {Logos},
  address =	 {Berlin},
  year =	 2003,
  pages =	 {3--9},
  title =	 {Discovery and analysis of biochemical subnetwork
                  hierarchies},
  author =	 {Petter Holme and Mikael Huss},
  abstract =	 {The representation of a biochemical network as a
                  graph is the coarsest level of description in
                  cellular biochemistry. By studying the network
                  structure one can draw conclusions on the large
                  scale organisation of the biochemical processes. We
                  describe methods how one can extract hierarchies of
                  subnetworks, how these can be interpreted and
                  further deconstructed to find autonomous
                  subnetworks. The large-scale organisation we find is
                  characterised by a tightly connected core surrounded
                  by increasingly loosely connected substrates.},
  url =		 {http://arxiv.org/abs/q-bio/0309011},
  pdf =		 {bionets/holme-huss-03.pdf},
  entered_on =	 {03/17/04},
  comments =	 {They talk about finding relevant biological
                  subnetworks, while in reality they only look at
                  subgraphs. They search for "modules" (that is,
                  densely connected subnets that almost don't talk to
                  the outside), not just any subnetworks. An
                  interesting twist here is a bipartite graph, where
                  the one type of nodes goes for chemicals, and
                  another one for reactions they participate in. Other
                  than that, this is a somewhat ad hoc method for
                  graph feature finding, which solves a problem not
                  clearly defined and possibly not very useful.},
}


@misc{ziv-etal-03,
  note =	 {arXiv:cond-mat/0306610},
  title =	 {Novel systematic discovery of statistically
                  significant network features},
  author =	 {Etay Ziv and Robin Koytcheff and Chris Wiggins},
  year =	 2003,
  abstract =	 {The physicist's desire to analyze in terms of local
                  structures, breaking systems into fundamental parts,
                  is uniquely thwarted by idealized networks: they are
                  composed of identical nodes, differentiated only by
                  the combinatorial explosion of possible
                  connections. The ideal reduced degrees of freedom --
                  the correct "local substructures" -- are not at all
                  obvious; we strive here to develop a systematic and
                  principled algorithm for their discovery. Functional
                  genomics and the development of modular biology
                  motivate a systematic, statistical approach to
                  identifying the most important features, functionals
                  of the adjacency matrix representation of the
                  graph. The features are global, involving all nodes
                  in each feature; although they can be related to
                  subgraph enumeration, the analysis does not require
                  hypothetical "most important" subgraphs. The
                  resulting algorithm provides an automated tool for
                  graph drawing and decomposition, and suggests novel
                  machine-learning techniques for network
                  classification.},
  url =		 {http://arxiv.org/abs/cond-mat/0306610},
  pdf =		 {bionets/ziv-etal-03.pdf},
  entered_on =	 {03/17/04},
  comments =	 {A couple of things are to be emphasized. Even though
                  the authors talk about networks, in reality they
                  mean graphs, and only adjacency matrices are
                  important. I am not sure if the adjacency matrix is
                  a relevant quantity to describe underlying
                  physics/biology/sociology, or if one always has to
                  talk about the full dynamical network (there may be
                  an "edge", but it may be "very weak"). Second, when
                  they define "significant features," this is being
                  done relative to the background ensemble of random
                  networks. The choice of randomization will
                  determine significance of a particular feature. It
                  is unclear to me that the randomization used in the
                  paper (preserving in-, out-, etc. degree) is the
                  correct way to proceed, even though "everybody does
                  this." One must understand what a particular
                  randomization procedure corresponds to. In general,
                  I believe that the approach suffers the problem that
                  it is a collection of ad hoc hacks, with little
                  fundamental justification for choosing either of
                  them. Algorithms (like the localization algorithm)
                  are a simple gradient descent algorithm, and, as
                  such, they are only approximate solutions to some
                  not clearly specified problem. This paper introduces
                  "motif hubs."},
}

@article{perelson-weisbuch-97,
  title =	 {Immunology for physicists},
  author =	 {Alan S. Perelson and G{\'e}rard Weisbuch},
  abstract =	 {The immune system is a complex system of cells and
                  molecules that can provide us with a basic defense
                  against pathogenic organisms. Like the nervous
                  system, the immune system performs pattern
                  recognition tasks, learns, and retains a memory of
                  the antigens that it has fought. The immune system
                  contains more than $10^7$ different clones of cells
                  that communicate via cell-cell contact and the
                  secretion of molecules. Performing complex tasks
                  such as learning and memory involves cooperation
                  among large numbers of components of the immune
                  system and hence there is interest in using methods
                  and concepts from statistical physics. Furthermore,
                  the immune response develops in time and the
                  description of its time evolution is an interesting
                  problem in dynamical systems. In this paper, the
                  authors provide a brief introduction to the biology
                  of the immune system and discuss a number of
                  immunological problems in which the use of physical
                  concepts and mathematical methods has increased our
                  understanding.},
  entered_on =	 {11/16/03},
  pdf =		 {bionets/perelson-weisbuch-97.pdf},
  journal =	 {Rev.\ Mod.\ Phys.},
  volume =	 69,
  number =	 {4},
  month =	 oct,
  year =	 {1997},
  pages =	 {1219--1267},
  comments =	 {A good review of immunology with physics
                  flavor. General observations: the field is quite
                  strange. There are some believable order of
                  magnitude estimates, but many of them are taken to
                  such precision levels, where I stop believing
                  them. Further, the assumed models seem to be quite
                  ad hoc (see e.g. page 19 for the shape of the
                  activation function, or the Cayley-tree model of the
                  immune network). It is difficult to see if the
                  results quoted are results valid in real life immune
                  systems, or if they are valid within the
                  hypothesized assumptions, reasonability of which is,
                  at best, questionable. Paper shows a theory which is
                  quite far from its experimental roots. Now some
                  specific questions. For the section on determining
                  the repertoire size: what is the a priori measure on
                  the space of shapes lymphocytes use in their
                  receptors to find antigens? What if the
                  "recognition ball" is in reality a zero-volume
                  fractal object that covers the whole space? Similar
                  (but not such extreme) scenarios are common in
                  learning. For determining the length of the sequence
                  used for matching: a simpler argument exists. We
                  need the longest possible strings (to avoid
                  accidental self-matching). The size of the repertoire
                  is n=10^6..10^7, and each receptor is to be
                  different. For the alphabet size of 3 (uncharged and
                  positive/negative charged amino acids) we need
                  r=log_3 n = 13 to index each receptor
                  individually. Done. For clone selection: why is this
                  a strictly uphill process in a spin-glass like
                  landscape? Won't accidental changes smear things and
                  create Metropolis-like maximization? Why does a two
                  clones model on page 20 have a symmetric coupling
                  matrix? For the description of requirements for an
                  immune network (p.23): why shouldn't the network
                  somewhat forget the previous antigens? There are
                  definitely analogies between Hopfield's and other
                  neural networks and the immune network. I quite
                  liked the calculation of the network capacity. But
                  why do they talk about spin glasses and other
                  symmetric interaction models? Symmetry is definitely
                  out of the question here.},
}


@article{wall-etal-04,
  author =	 {Michael Wall and William Hlavacek and Michael
                  Savageau},
  title =	 {Design of Gene Circuits: Lessons from Bacteria},
  journal =	 {Nature Rev.\ Genetics},
  year =	 {2004},
  note =	 {to be published},
  abstract =	 {Researchers are now building synthetic circuits for
                  controlling gene expression and considering
                  practical applications of engineered gene
                  circuits. What can we learn from Nature about design
                  principles for gene circuits? A large body of
                  experimental data is now available to test some
                  important theoretical predictions of how gene
                  circuits could be organized but the data also raise
                  some intriguing new questions.},
  comments =	 {A good review of literature on construction of
                  simple regulatory networks: what are the building
                  blocks? Why should they be chosen given constraints
                  on robustness, stability, responsiveness?},
  entered_on =	 {11/25/03},
  pdf =		 {bionets/wall-etal-04.pdf},
}

@article{detwiler-etal-00,
  title =	 {Engineering Aspects of Enzymatic Signal
                  Transduction: Photoreceptors in the Retina},
  author =	 {Peter Detwiler and Sharad Ramanathan and Anirvan
                  Sengupta and Boris Shraiman},
  abstract =	 {Identifying the basic module of enzymatic
                  amplification as an irreversible cycle of messenger
                  activation/deactivation by a push-pull pair of
                  opposing enzymes, we analyze it in terms of gain,
                  bandwidth, noise, and power consumption. The
                  enzymatic signal transduction cascade is viewed as
                  an information channel, the design of which is
                  governed by the statistical properties of the input
                  and the noise and dynamic range constraints of the
                  output. With the example of vertebrate
                  phototransduction cascade we demonstrate that all of
                  the relevant engineering parameters are controlled
                  by enzyme concentrations and, from functional
                  considerations, derive bounds on the required
                  protein numbers. Conversely, the ability of
                  enzymatic networks to change their response
                  characteristics by varying only the abundance of
                  different enzymes illustrates how functional
                  diversity may be built from nearly conserved
                  molecular components.},
  journal =	 {Biophys.\ J.},
  volume =	 79,
  month =	 dec,
  year =	 2000,
  pages =	 {2801--2817},
  entered_on =	 {04/04/03},
  comments =	 {This is definitely a paper worth reading. It
                  provides a comprehensive analysis of the
                  amplification cascade and proves a bunch of
                  inequalities that put bounds on different
                  performance aspects of the system. The results
                  include: relation between the time scale of the
                  response and gains (high gain means slow response);
                  noise and energy dissipation (lower noise means more
                  energy); cascade of amplifiers (amplifiers should be
                  about the same to minimize noise and maximize speed
                  of response). The paper also analyzes amplifiers
                  with feedback and puts bounds on the minimal required
                  gain and the minimal messenger concentration so that
                  the effects of 1/sqrt(N) noise are smaller than the
                  distinguishable changes in the output. However,
                  I slightly disagree with the style of the discussion
                  here, as well as in the adaptation section. In my
                  view, one should not count distinguishable states in
                  the signal to get the mutual information between the
                  input and the output of the amplifier. Instead, one
                  should calculate the mutual information between the
                  output of the amplifier and the outside world, and
                  then the conditional distribution of the input given
                  the outside world will involve the sqrt(N) effects
                  without specifically imposing them. The adaptation
                  section (where they claim imperfect adaptation) has
                  two serious flaws. First, imperfectness comes when
                  the speed of the amplifier is kept constant, which
                  is an unreasonable assumption, since at lower light
                  intensity it may be reasonable for the amplifier to
                  perform slower. The second problem: why should we
                  optimize the information flow through the amplifier,
                  rather than predictive information flow?
                  \cite{bnt-01}. Optimizing the latter one may create
                  an optimization problem for the best time scale for
                  the amplifier and explain the imperfectness of
                  adaptation! Finally, at the end the paper analyzes
                  the explicit implementation of the amplifier in the
                  phototransduction pathways, and shows that all of
                  the essential elements in the cascade can be
                  independently regulated to achieve better
                  performance of the network. I am slightly lost in
                  the biological data in these sections. Finally
                  Eq. (51) in the Appendix D seems mysterious to me.},
  pdf =		 {bionets/detweiler-ramanathan-sengupta-shraiman-00.pdf},
}

@article{paulsson-04,
  year =	 2004,
  month =	 jan,
  title =	 {Summing up the noise in gene networks},
  author =	 {Johan Paulsson},
  abstract =	 {Random fluctuations in genetic networks are
                  inevitable as chemical reactions are probabilistic
                  and many genes, RNAs and proteins are present in low
                  numbers per cell. Such `noise'
                  affects all life processes and has recently been
                  measured using green fluorescent protein (GFP). Two
                  studies show that negative feedback suppresses
                  noise, and three others identify the sources of
                  noise in gene expression. Here I critically analyse
                  these studies and present a simple equation that
                  unifies and extends both the mathematical and
                  biological perspectives.},
  journal =	 {Nature},
  volume =	 427,
  number =	 6973,
  pages =	 {415--418},
  pdf =		 {bionoise/paulsson-04.pdf},
  comments =	 {Supplement is <a
                  href="bionoise/paulsson-04_s1.pdf">here</a>.
                  Analyses many different experiments on measuring
                  noise in single cells and reinterprets the data
                  arguing that the noise may, in fact, be coming not
                  from the sources that it is attributed to
                  naively. Omega expansion to the second
                  order. Separation of intrinsic vs. extrinsic noise
                  (multiplied by susceptibility and filtered
                  out). Note on sub- or super- Poisson nature of
                  self-regulating loops, and the effect of
                  autoregulation on intrinsic noise, filtering time,
                  and susceptibility to extrinsic noise. Not sure how
                  to get Eq. S5.},
}

                  
                  
@article{thattai-oudenaarden-01,
  title =	 {Intrinsic noise in gene regulatory networks},
  author =	 {Thattai, Mukund and van Oudenaarden, Alexander},
  abstract =	 {Cells are intrinsically noisy biochemical reactors:
                  low reactant numbers can lead to significant
                  statistical fluctuations in molecule numbers and
                  reaction rates. Here we use an analytic model to
                  investigate the emergent noise properties of genetic
                  systems. We find for a single gene that noise is
                  essentially determined at the translational level,
                  and that the mean and variance of protein
                  concentration can be independently controlled. The
                  noise strength immediately following single gene
                  induction is almost twice the final steady-state
                  value. We find that fluctuations in the
                  concentrations of a regulatory protein can propagate
                  through a genetic cascade; translational noise
                  control could explain the inefficient translation
                  rates observed for genes encoding such regulatory
                  proteins. For an autoregulatory protein, we
                  demonstrate that negative feedback efficiently
                  decreases system noise. The model can be used to
                  predict the noise characteristics of networks of
                  arbitrary connectivity. The general procedure is
                  further illustrated for an autocatalytic protein and
                  a bistable genetic switch. The analysis of intrinsic
                  noise reveals biological roles of gene network
                  structures and can lead to a deeper understanding of
                  their evolutionary origin.},
  pages =	 {8614--8619},
  journal =	 {Proc.\ Natl.\ Acad.\ Sci.\ USA},
  year =	 2001,
  volume =	 98,
  number =	 15,
  entered_on =	 {12/06/04},
  pdf =		 {bionoise/thattai-oudenaarden-01.pdf},
  comments =	 {Good references collection (5-10) on analytical
                  treatment of noise in biomolecular networks (many by
                  Berg and Paulsson). A very simple linear noise
                  analysis of noisy biochemical networks. Useful, and
                  algebra is painlessly easy. Notice that
                  discrepancies on fig 4d are trivially correctable
                  -- the system switches between two steady states,
                  and such analysis is easily done (see, e.g., van
                  Kampen). Brief discussion of tradeoff between
                  inefficiency (production of many untranslated mRNAs)
                  and low protein noise variance. Get reference
                  \cite{berg-78}.},
}

@article{berg-78,
  title =	 {A Model for the Statistical Fluctuations of Protein
                  Numbers in a Microbial Population},
  author =	 {Berg, Otto G.},
  year =	 1978,
  journal =	 {J.\ Theor.\ Biol.},
  volume =	 71,
  pages =	 {587--603},
}
@article{buchler-etal-03,
  title =	 {On schemes of combinatorial transcription logic},
  author =	 {N Buchler and U Gerland and T Hwa},
  abstract =	 {Cells receive a wide variety of cellular and
                  environmental signals, which are often processed
                  combinatorially to generate specific genetic
                  responses. Here we explore theoretically the
                  potentials and limitations of combinatorial signal
                  integration at the level of cis-regulatory
                  transcription control. Our analysis suggests that
                  many complex transcription-control functions of the
                  type encountered in higher eukaryotes are already
                  implementable within the much simpler bacterial
                  transcription system. Using a quantitative model of
                  bacterial transcription and invoking only specific
                  protein--DNA interaction and weak glue-like
                  interaction between regulatory proteins, we show
                  explicit schemes to implement regulatory logic
                  functions of increasing complexity by appropriately
                  selecting the strengths and arranging the relative
                  positions of the relevant protein-binding DNA
                  sequences in the cis-regulatory region. The
                  architectures that emerge are naturally modular and
                  evolvable. Our results suggest that the
                  transcription regulatory apparatus is a
                  "programmable" computing machine, belonging formally
                  to the class of Boltzmann machines. Crucial to our
                  results is the ability to regulate gene expression
                  at a distance. In bacteria, this can be achieved for
                  isolated genes via DNA looping controlled by the
                  dimerization of DNA-bound proteins. However, if
                  adopted extensively in the genome, long-distance
                  interaction can cause unintentional intergenic cross
                  talk, a detrimental side effect difficult to
                  overcome by the known bacterial
                  transcription-regulation systems. This may be a key
                  factor limiting the genome-wide adoption of complex
                  transcription control in bacteria. Implications of
                  our findings for combinatorial transcription control
                  in eukaryotes are discussed.},
  journal =	 {Proc.\ Natl.\ Acad.\ Sci.\ USA},
  pages =	 {5136--5141},
  volume =	 100,
  number =	 9,
  year =	 2003,
  entered_on =	 {02/02/05},
  comments =	 {The simplest way to get an XOR gate is to let two
                  TFs form an inert complex, but not stuff on
                  Fig. 3. Similarly, one can implement the EQ=NOT XOR
                  gate. I don't get the argument that this is
                  "hardware," while binding strength is software. I am
                  afraid, I am not getting the point of this paper. Is
                  it that one can make arbitrary complex regulation
                  logic by adjusting bindings at TF binding sites? Why
                  is this surprising?},
}

@article{gerland-etal-02,
  title =	 {Physical constraints and functional characteristics
                  of transcription factor--{DNA} interaction},
  year =	 2002,
  journal =	 {Proc.\ Natl.\ Acad.\ Sci.\ USA},
  volume =	 99,
  number =	 19,
  pages =	 {12015--12020},
  author =	 {U Gerland and JD Moroz and T Hwa},
  abstract =	 {We study theoretical design principles for
                  transcription factor (TF)--DNA interaction in
                  bacteria, focusing particularly on the statistical
                  interaction of the TFs with the genomic background
                  (i.e., the genome without the target sites). We
                  introduce and motivate the concept of
                  programmability, i.e., the ability to set the
                  threshold concentration for TF binding over a wide
                  range merely by mutating the binding sequence of a
                  target site. This functional demand, together with
                  physical constraints arising from the thermodynamics
                  and kinetics of TF--DNA interaction, leads us to a
                  narrow range of "optimal" interaction parameters. We
                  find that this parameter set agrees well with
                  experimental data for the interaction parameters of
                  a few exemplary prokaryotic TFs, which indicates
                  that TF--DNA interaction is indeed programmable. We
                  suggest further experiments to test whether this is
                  a general feature for a large class of TFs.},
  pdf =		 {biophysics/gerland-etal-02.pdf},
  entered_on =	 {02/02/05},
  comments =	 {In many respects, the paper overlaps with the recent
                  work of Mirny et al. (Notice modeling background
                  binding as a random energy model). Here they seem to
                  suggest that the TF binding sequence is an
                  adjustable parameter, and the design principle of
                  TFs might be precisely such that one can have a very
                  wide range of specific binding probabilities (as
                  opposed to nonspecific) as a function of the
                  particular binding sequence. This might be true, but
                  also might not be. They provide some bounds on when
                  and how this might happen, and provide some
                  anecdotal evidence that standard bacterial TFs may
                  have this "programmability" built in them, so that
                  there is a very wide range of possible binding
                  energies. In eq 6, Z for specific binding is
                  calculated with the assumption of random (sea)
                  probabilities of single nucleotides in the specific
                  binding regions. Relation of TF-DNA interactions to
                  random energy models, some references. Worth looking
                  into. References to von Hippel and Berg, need to be
                  read. Good discussion of the search of a binding
                  site by a TF. Design principle for control networks:
                  "maximal programmability" -- ability to change the
                  threshold binding by small amounts just by changing
                  the sequence.},
}

@unpublished{buchler-etal-05,
  title =	 {Nonlinear Protein Degradation and the Function of
                  Genetic Circuits},
  author =	 {N Buchler and U Gerland and T Hwa},
  note =	 {submitted},
  year =	 2005,
  abstract =	 {The functions of most genetic circuits require
                  sufficient degrees of cooperativity in the circuit
                  components. While mechanisms of cooperativity have
                  been studied most extensively in the context of
                  transcriptional initiation control, cooperativity
                  from other processes involved in the operation of
                  the circuits can also play important roles. In this
                  study, we examine a simple kinetic source of
                  cooperativity stemming from the nonlinear
                  degradation of multimeric proteins. Ample
                  experimental evidence suggests that protein subunits
                  can degrade less rapidly when associated in
                  multimeric complexes, an effect we refer to as
                  cooperative stability. For dimeric transcription
                  factors, this effect leads to a
                  concentration-dependence in the degradation rate
                  because monomers, which are predominant at low
                  concentrations, will be more rapidly degraded. Thus
                  cooperative stability can effectively widen the
                  accessible range of protein levels in vivo. Through
                  theoretical analysis of two exemplary genetic
                  circuits in bacteria, we show that such an increased
                  range is important for the robust operation of
                  genetic circuits as well as their evolvability. Our
                  calculations demonstrate that a few-fold difference
                  between the degradation rate of monomers and dimers
                  can already enhance the function of these circuits
                  substantially. These results suggest that
                  cooperative stability needs to be considered
                  explicitly and characterized quantitatively in any
                  systematic experimental or theoretical study of gene
                  circuits.},
  pdf =		 {biophysics/buchler-etal-05.pdf},
  url =		 {http://matisse.ucsd.edu/~hwa/pub/},
  entered_on =	 {02/01/05},
  comments =	 {This is a fairly straightforward paper. They show
                  that decreased decay rate of a transcription factor
                  due to formation of a protein complex results in the
                  circuit exhibiting the requested behavior
                  (bistability or oscillation for different circuits)
                  over a wider range of parameters. This is a bit
                  trivial. Consider a self-activating gene (which may
                  show stability). In the extreme, where decay rate of
                  a single protein is extremely high, and the decay
                  rate of the complex is 0, we will need an extreme
                  sensitivity of the expression as a function of the
                  concentration of the protein to show bistability in
                  the single protein case, while for the complex
                  bistability will be easy to obtain (will require
                  very weak association of singles into dimers; this
                  will create a problem with low copy numbers, as they
                  mention in Discussion). It's no wonder that similar
                  trends persist for finite, but unequal decay
                  rates. It's a bit more interesting (but still not
                  surprising) that these trends hold for biologically
                  relevant parameter values (btw, paper has some good
                  references to what biologically relevant parameter
                  values may be). However, it is unclear what is
                  achieved by doing detailed mathematical analysis of
                  quite unrealistic circuits instead of doing a brief
                  order of magnitude estimate, as I suggested above.},
}

@article{bundschuh-hwa-02,
  journal =	 {Phys.\ Rev.\ E},
  volume =	 65,
  pages =	 {031903},
  year =	 2002,
  title =	 {Statistical mechanics of secondary structures formed
                  by random {RNA} sequences},
  author =	 {R. Bundschuh and T. Hwa},
  abstract =	 {The formation of secondary structures by a random
                  RNA sequence is studied as a model system for the
                  sequence-structure problem omnipresent in
                  biopolymers. Several toy energy models are
                  introduced to allow detailed analytical and
                  numerical studies. First, a two-replica calculation
                  is performed. By mapping the two-replica problem to
                  the denaturation of a single homogeneous RNA
                  molecule in six-dimensional embedding space, we show
                  that sequence disorder is perturbatively irrelevant,
                  i.e., an RNA molecule with weak sequence disorder is
                  in a molten phase where many secondary structures
                  with comparable total energy coexist. A numerical
                  study of various models at high temperature
                  reproduces behaviors characteristic of the molten
                  phase. On the other hand, a scaling argument based
                  on the extremal statistics of rare regions can be
                  constructed to show that the low-temperature phase
                  is unstable to sequence disorder. We performed a
                  detailed numerical study of the low-temperature
                  phase using the droplet theory as a guide, and
                  characterized the statistics of large-scale,
                  low-energy excitations of the secondary structures
                  from the ground state structure. We find the
                  excitation energy to grow very slowly (i.e.,
                  logarithmically) with the length scale of the
                  excitation, suggesting the existence of a marginal
                  glass phase. The transition between the
                  low-temperature glass phase and the high-temperature
                  molten phase is also characterized numerically. It
                  is revealed by a change in the coefficient of the
                  logarithmic excitation energy, from being disorder
                  dominated to being entropy dominated.},
  entered_on =	 {05/10/04},
  pdf =		 {biophysics/bundschuh-hwa-02.pdf},
  comments =	 {Very comprehensive and readable paper. Good
                  bibliography on RNA folding. Discussion on
                  knots. Hartree equation binding description. Did not
                  follow computations in detail. This work assumes
                  "random" RNA. But is RNA really random? There's
                  evolution involved.},
}

@article{slutsky-etal-03,
  note =	 {arXiv: q-bio.BM/0310008},
  title =	 {The long reach of {DNA} sequence heterogeneity in
                  diffusive processes},
  author =	 {M Slutsky and M Kardar and L Mirny},
  journal =	 {Phys.\ Rev.\ E},
  volume =	 69,
  pages =	 {061903},
  year =	 2004,
  abstract =	 {Many biological processes involve one dimensional
                  diffusion over a correlated inhomogeneous energy
                  landscape with a correlation length $\xi_c$. Typical
                  examples are specific protein target location on
                  DNA, nucleosome repositioning, or DNA translocation
                  through a nanopore, in all cases with $\xi_c\approx$
                  10 nm. We investigate such transport processes by
                  the mean first passage time (MFPT) formalism, and
                  find diffusion times which exhibit strong sample to
                  sample fluctuations. For a displacement $N$, the
                  average MFPT is diffusive, while its standard
                  deviation over the ensemble of energy profiles
                  scales as $N^{3/2}$ with a large
                  prefactor. Fluctuations are thus dominant for
                  displacements smaller than a characteristic $N_c \gg
                  \xi_c$: typical values are much less than the mean,
                  and governed by an anomalous diffusion
                  rule. Potential biological consequences of such
                  random walks, composed of rapid scans in the
                  vicinity of favorable energy valleys and occasional
                  jumps to further valleys, are discussed.},
  pdf =		 {biophysics/slutsky-etal-03.pdf},
  entered_on =	 {07/20/04},
  comments =	 {Why is the correlation length of fluctuations
                  finite? I thought that some German group measured
                  exponentially decaying correlations in the DNA
                  sequences. Is it really a good approximation that
                  the fluctuations are normal? Ref 27 to random energy
                  model for protein folding may be worth looking into.},
}

@incollection{cardy-99,
  title =	 {Field Theory and Nonequilibrium Statistical
                  Mechanics},
  author =	 {J Cardy},
  year =	 1999,
  booktitle =	 {Troisi{\`e}me Cycle de la Suisse Romande},
  postscript =	 {books/cardy-99.ps},
  comments =	 {Doi-Peliti for chemical reactions.},
}

% NOTE(review): walleczek-00 is a crossref parent. Classic BibTeX requires a
% cross-referenced entry to appear AFTER every entry that refers to it; the
% chapters below reference it, so move this entry past them if the file is
% processed with classic BibTeX (Biber is order-independent).
@book{walleczek-00,
  title =	 {Self-Organized Biological Dynamics and Nonlinear
                  Control},
  booktitle =	 {Self-Organized Biological Dynamics and Nonlinear
                  Control},
  editor =	 {J Walleczek},
  year =	 2000,
  publisher =	 {Cambridge University Press},
  entered_on =	 {05/24/06},
  comments =	 {A very decent collection},
}

@incollection{kaiser-00,
  author     = {F Kaiser},
  title      = {External signals and internal oscillation dynamics:
                principal aspects and response of stimulated rhythmic
                processes},
  pages      = {15--43},
  crossref   = {walleczek-00},
  entered_on = {05/24/06},
  comments   = {Discusses driven non-linear systems, such as a Van der
                Pol (active) oscillator or a double-well (passive)
                oscillator, and the effect on their behavior that the
                external driving force has, including various
                entrapments, influences of slow and fast signals,
                etc. Analyses the SNR for such oscillators in response
                to signals of different frequencies. I need to catch up
                on the dynamical systems theory; references to reviews
                on the subject.},
}


@incollection{larter-etal-00,
  author     = {R Larter and R Worth and B Speelman},
  title      = {Nonlinear dynamics in biochemical and biophysical
                systems: from enzyme kinetics to epilepsy},
  pages      = {44--65},
  crossref   = {walleczek-00},
  entered_on = {05/24/06},
  comments   = {Studies nonlinear oscillations in the
                peroxidase-oxidase reaction, and then epilepsy in a
                more-or-less realistic dynamical systems context.},
}

@incollection{peng-etal-00,
  author     = {C-K Peng and J Hausdorff and A Goldberger},
  title      = {Fractal mechanism in neuronal control: human heartbeat
                and gait dynamics in health and disease},
  pages      = {66--96},
  crossref   = {walleczek-00},
  entered_on = {05/24/06},
  comments   = {Data mining of human heart beats and gait to find a
                1/f-like spectrum. For the gait, there is an argument
                made (experimentally supported) that its fractal
                structure is due to time-keeping, rather than due to
                control of gait to known time intervals.},
}

@incollection{ding-etal-00,
  author     = {M Ding and Y Chen and J Kelso and B Tuller},
  title      = {Self-organizing dynamics in human sensorimotor
                coordination and perception},
  pages      = {97--111},
  crossref   = {walleczek-00},
  entered_on = {05/24/06},
  comments   = {Analysis of the data of a human tapping on a knob in
                anticipation of a periodic beep. Various
                correlation-etc analysis is done to establish
                scalings. Some model to predict the gap duration
                between the beep and the tap is introduced as some
                first order differential equation.},
}

@incollection{arkin-00,
  author     = {A Arkin},
  title      = {Signal processing by biochemical reaction networks},
  pages      = {112--144},
  crossref   = {walleczek-00},
  pdf        = {books/arkin-00.pdf},
  entered_on = {05/24/06},
  comments   = {Discussion of signal transduction properties of
                biochemical networks. General philosophy. Examples of
                some control elements, like voltage dividers, low pass
                filters, band pass filters, futile cycle, bistability,
                stochasticity in gene expression (following
                \cite{mcadmas-arkin-97}); very good collection of
                references.},
}

@incollection{gailey-00,
  author     = {P Gailey},
  title      = {Electrical signal detection and noise in systems with
                long-range coherence},
  pages      = {147--172},
  crossref   = {walleczek-00},
  entered_on = {05/24/06},
  comments   = {Discusses physical limits to sensing electric signals,
                including noise in channel opening due to electric
                fields, etc.},
}

@incollection{petty-00,
  author     = {H Petty},
  title      = {Oscillatory signals in migrating neutrophils: effects
                of time varying chemical and electric fields},
  pages      = {173--192},
  crossref   = {walleczek-00},
  entered_on = {05/24/06},
  comments   = {Electrical fields detection paper.},
}

@incollection{walleczek-eichwald-00,
  author     = {J Walleczek and C Eichwald},
  title      = {Enzyme kinetics and nonlinear biochemical amplification
                in response to static and oscillating magnetic fields},
  pages      = {193--215},
  crossref   = {walleczek-00},
  entered_on = {05/24/06},
  comments   = {Magnetic field affects free radicals, flipping their
                spins, and not allowing for them to recombine, thus
                changing the rates of forward/backward reactions.},
}

@incollection{engstrom-etal-00,
  author     = {S Engstrom and S Bawin and W Adey},
  title      = {Magnetic field sensitivity in the hippocampus},
  pages      = {216--234},
  crossref   = {walleczek-00},
  entered_on = {05/24/06},
}

@incollection{moss-00,
  author     = {F Moss},
  title      = {Stochastic resonance: looking forward},
  pages      = {236--256},
  crossref   = {walleczek-00},
  pdf        = {books/moss-00.pdf},
  entered_on = {05/24/06},
  comments   = {Good review of stochastic resonance and related
                literature, both dynamic (jumps between two wells),
                and non-dynamic (noise-over-threshold). Formulas for
                SNR in the spiking-if-over-threshold stochastic
                resonance. Analysis of experimental data of injecting
                noise into crayfish neurons and seeing signal
                amplification for optimal noise levels.},
}

@incollection{bezrukov-vodyanoy-00,
  author     = {S Bezrukov and I Vodyanoy},
  title      = {Stochastic resonance and small-amplitude signal
                transduction in voltage-gated ion channels},
  pages      = {257--280},
  crossref   = {walleczek-00},
  entered_on = {05/24/06},
  comments   = {Considering stochastic resonance in a neuron (or a
                channel), where there is no threshold, but the
                probability of firing (opening) is modulated
                exponentially by the signal+noise combination. They
                get SNR for such systems for different simple (sine,
                etc) signals. It would be interesting to combine this
                with \cite{bialek-zee-90} and with
                \cite{brenner-etal-02}.},
}

@incollection{astumian-00,
  author     = {R Astumian},
  title      = {Ratchets, rectifiers, and demons: the constructive role
                of noise in free energy and signal transduction},
  pages      = {281--300},
  crossref   = {walleczek-00},
  entered_on = {05/24/06},
  comments   = {Review of literature on thermal ratchets, Maxwell
                demons, and biomolecular motors from the point of view
                of statistical physics.},
}

@book{geisser-93,
  author     = {S Geisser},
  title      = {Predictive Inference: {A}n introduction},
  publisher  = {Chapman \& Hall},
  address    = {New York},
  year       = 1993,
  entered_on = {12/22/05},
  comments   = {Unlike most statistics books, this book presents
                statistics (mostly Bayesian), where inference of
                parameters is not the Holy Grail, but rather a step on
                a way to prediction.},
}


@book{durbin-etal-98,
  title     = {Biological sequence analysis: {P}robabilistic models of
               proteins and nucleic acids},
  author    = {R Durbin and S Eddy and A Krogh and G Mitchison},
  year      = 1998,
  publisher = {Cambridge UP},
  address   = {Cambridge, UK},
}

@book{pavlov-27,
  author     = {I P Pavlov},
  title      = {Conditioned reflexes},
  publisher  = {Routledge and Kegan Paul},
  address    = {London},
  year       = 1927,
  entered_on = {06/29/05},
}


@book{cover-thomas-91,
  author    = {T Cover and J Thomas},
  title     = {Elements of information theory},
  publisher = {John Wiley \& Sons},
  address   = {New York},
  year      = 1991,
}

@book{sw-49,
  author    = {C E Shannon and W Weaver},
  title     = {The mathematical theory of communication},
  publisher = {The University of Illinois Press},
  address   = {Urbana},
  year      = 1949,
}

@book{spikes,
  title     = {Spikes: {E}xploring the neural code},
  author    = {F Rieke and
               D Warland and
               de Ruyter van Steveninck, R and
               W Bialek},
  year      = 1997,
  publisher = {MIT Press},
  address   = {Cambridge, MA},
}

@book{gradshteyn-ryzhik-00,
  author    = {I. S. Gradshteyn and I. M. Ryzhik},
  title     = {Table of integrals, series, and products},
  edition   = {6th},
  publisher = {Academic Press},
  address   = {Burlington, MA},
  year      = 2000,
}


@book{larsen-marx-81,
  author    = {Richard J. Larsen and Morris L. Marx},
  title     = {An introduction to mathematical statistics and its
               applications},
  publisher = {Prentice Hall},
  address   = {Englewood Cliffs, NJ},
  year      = 1981,
  comments  = {Standard statistics textbook.},
}

@book{wiener-49,
  author    = {N Wiener},
  title     = {Extrapolation, interpolation, and smoothing of stationary
               time series},
  publisher = {MIT Press \& John Wiley \& Sons},
  address   = {New York},
  year      = 1949,
}

@book{gardiner-04,
  author    = {C W Gardiner},
  title     = {Handbook of Stochastic Methods for Physics, Chemistry,
               and the Natural Sciences},
  publisher = {Springer},
  address   = {New York},
  year      = 2004,
  comments  = {Great textbook and encyclopedia on stochastic methods},
}

                  
@book{van-kampen,
  author    = {van Kampen, N. G.},
  title     = {Stochastic Processes in Physics and Chemistry},
  edition   = {3rd},
  publisher = {North Holland},
  year      = 2001,
}


@book{gray-91,
  author     = {Robert M. Gray},
  title      = {Entropy and Information Theory},
  publisher  = {Springer-Verlag},
  address    = {New York},
  year       = 1991,
  entered_on = {05/07/04},
  comments   = {Probability theory heavy book. Have on sarek. He has a
                very good exposition on KL divergence, its properties,
                inequalities, applications to large deviation theory,
                etc. Look around pages 109 in sarek's pdf
                version. Generalized divergences (non-existent
                pdf's).},
}

@book{bialek-92,
  editor     = {William Bialek},
  title      = {Princeton Lectures on Biophysics},
  publisher  = {World Scientific},
  address    = {Singapore},
  year       = {1992},
  entered_on = {02/19/04},
  comments   = {Very much recommended.},
}

@incollection{bialek-92a,
  author     = {William Bialek},
  title      = {Optimal signal processing in the nervous system},
  booktitle  = {Princeton Lectures on Biophysics},
  pages      = {321--401},
  crossref   = {bialek-92},
  entered_on = {02/27/04},
  comments   = {Very nice review of optimization principles. Need to
                follow up on many of these ideas. In particular: (1)
                Refs. 98, 99 -- derivation of the photoreceptor size
                in the compound eye design. (2) Estimating the
                sampling time step for saccading motion. (3) Cochlear
                compromise -- Refs. 23, 117. (4) Maps in the cortex as
                optimal -- Refs 34,68,90,106. (5) Optimal performance
                in bat echolocation, Ref. 97. (6) Human detection of
                symmetry, Ref. 102. Good points on the need to do
                things carefully, including considering statistics of
                natural images. Discussion of (1) single photon
                detection in retina and bacteria, Ref 63; (2) matching
                filters in optimal analog signal preprocessing, and
                comparison to processing in bipolar cells (Rieke's
                thesis, refs 76,78); (3) optimal filters and receptive
                fields to maximize information transmission by many
                pixels in retina (with Zee and Ruderman), other useful
                stuff. Finally, on pages 369-370 there is some early
                discussion of optimizing predictive information with
                some hints at (unmaterialized) results. References to
                fly trajectories papers. Discussion of SNR in fly
                photoreceptors and comparison to optimal detection of
                motion by a fly. It's interesting to note that this
                chapter deals with optimality mostly in small SNR
                regime. Funny, but a year ago Shraiman told me that
                small intensity adaptation in animals is easy; they
                just lose optimality for large SNR, when optimality is
                less important. What a change of expectations!},
}



@incollection{simmons-92,
  author     = {James Simmons},
  title      = {Time-frequency transforms and images of targets in the
                sonar of bats},
  booktitle  = {Princeton Lectures on Biophysics},
  pages      = {291--320},
  crossref   = {bialek-92},
  entered_on = {02/27/04},
  comments   = {Complete review of bat echolocation experiments. Some
                points that the bat operates at the threshold -- it
                distinguishes echoes as best as possible given their
                frequency/phase structure. Points on how population
                coding may be used to code extremely precise
                (microseconds!) structures in the echosignal.},
}

@incollection{hong-etal-92,
  author     = {Mi Hong and Robert Austin and John Plombin and Robert
                Callendar},
  title      = {Collective Modes in Bacteriorhodopsin},
  booktitle  = {Princeton Lectures on Biophysics},
  pages      = {1--26},
  crossref   = {bialek-92},
  entered_on = {02/19/04},
  comments   = {Did not follow thoroughly.},
}



@incollection{loppnow-92,
  author     = {Glen Loppnow},
  title      = {The first picosecond of vision: Raman studies of
                rhodopsin},
  booktitle  = {Princeton Lectures on Biophysics},
  pages      = {27--69},
  crossref   = {bialek-92},
  entered_on = {02/19/04},
  comments   = {Good intro. Did not follow closely.},
}

@incollection{regan-etal-92,
  author     = {J Regan and J Betts and D Beratan and J Onuchic},
  title      = {Finding electron transfer pathways},
  booktitle  = {Princeton Lectures on Biophysics},
  pages      = {175--197},
  crossref   = {bialek-92},
  entered_on = {02/19/04},
  comments   = {Didn't quite understand this one.},
}

@incollection{chan-92,
  author     = {Hue Sun Chan and Ken Dill and David Shortle},
  title      = {Statistical mechanics and protein folding},
  booktitle  = {Princeton Lectures on Biophysics},
  pages      = {69--174},
  crossref   = {bialek-92},
  entered_on = {02/19/04},
  comments   = {Didn't follow closely.},
}

@incollection{kruglyak-92,
  author     = {Leonid Kruglyak},
  title      = {Physical constraints and optimal signal processing in
                bacterial chemotaxis},
  booktitle  = {Princeton Lectures on Biophysics},
  pages      = {197--222},
  crossref   = {bialek-92},
  entered_on = {02/19/04},
  keywords   = {idea!},
  comments   = {Recommended. Analysis of optimality of bacterial
                chemotactic behavior. The optimality criterion is that
                the bacteria, whose stationary distribution is $P (x)
                \propto \exp[\beta c(x)]$, where $c$ is the attractor
                concentration, is maximized when $c$ is maximal. That
                is, $\beta$ must be minimized. $\beta$ depends on the
                velocity and the statistics of tumbles. Two effects
                fight not allowing $\beta$ to go to zero -- the animal
                must integrate for long enough to get reliable
                concentration estimates, and if it integrates for too
                long, its direction of motion changes due to Brownian
                effects, and estimating the concentration gradient has
                to be started anew. The paper derives a linear filter
                which bacteria should use for estimating gradients to
                optimize $\beta$ and does some comparison with
                data. Two things are missing from this argument, I
                think: how fast will the stationary distribution be
                established? If $c(x)$ varies fast, won't the bacteria
                overshoot their maxima? To combine these
                considerations into a single optimization principle,
                we will need to know the statistics of the
                concentrations. Then, I think, the ideas of maximizing
                predictive information should be able to form a
                single, cohesive optimization principle, which will
                take both the speed and the quality into account on
                the same footing.},
}

@incollection{atick-92,
  author     = {Joseph Atick},
  title      = {Could information theory provide an ecological theory
                of sensory processing?},
  booktitle  = {Princeton Lectures on Biophysics},
  pages      = {223--290},
  crossref   = {bialek-92},
  entered_on = {02/19/04},
  keywords   = {idea!},
  comments   = {Information theoretic efficiency in encoding the
                outside world: remove redundancy in the outputs
                (minimize mutual info) while keeping their entropy
                enough to represent the entropy of the world;
                whitening of input signals by retina (theory and
                experiment); cutoffs in filters due to noise; color
                redundancy separating filters; analysis of
                information-theoretic ideas for optimal coding of
                sensory information (following Laughlin's 1981
                argument). Optimal gain for a single signal or for
                many correlated signals is analysed. Paper has many
                references which may turn out useful for studying
                statistics of synaptic connections in the eye, like
                the reference to Sterling, 1990, in the collection
                "The synaptic organization of the brain" from Oxford
                UP. Good discussion of dependence of the type of color
                coding used in animal as a function of the environment
                they live in, might be relevant for my predictive
                information maximization ideas.},
}


@book{cox-hinkley-74,
  author     = {D. R. Cox and D. V. Hinkley},
  title      = {Theoretical statistics},
  publisher  = {Chapman and Hall},
  address    = {London},
  year       = {1974},
  entered_on = {06/02/03},
  comments   = {The book is supposed to have many derivations regarding
                Dirichlet priors on pp.371--372 (according to
                \cite{hall-hannan-88}).},
}

@article{block-etal-82,
  author     = {S Block and J Segall and H Berg},
  title      = {Impulse responses in bacterial chemotaxis},
  journal    = {Cell},
  volume     = 31,
  pages      = {215--226},
  year       = 1982,
  pdf        = {chemotaxis/block-etal-82.pdf},
  abstract   = {The chemotactic behavior of E. coli has been studied by
                exposing cells tethered by a single flagellum to
                pulses of chemicals delivered
                ionophoretically. Normally, wild-type cells spin
                alternatively CW and CCW, changing their direction on
                the average },
  comments   = {},
}

@article{korobkova-etal-04,
  author     = {E Korobkova and T Emonet and J M G Vilar and T S
                Shimizu and P Cluzel},
  title      = {From molecular noise to behavioural variability in a
                single bacterium},
  journal    = {Nature},
  volume     = 428,
  pages      = {574--578},
  year       = 2004,
  pdf        = {chemotaxis/korobkova-etal-04.pdf},
  abstract   = {The chemotaxis network that governs the motion of
                Escherichia coli has long been studied to gain a
                general understanding of signal
                transduction. Although this pathway is composed of
                just a few components, it exhibits some essential
                characteristics of biological complexity, such as
                adaptation and response to environmental
                signals(1). In studying intracellular networks, most
                experiments and mathematical models(2-5) have
                assumed that network properties can be inferred from
                population measurements. However, this approach
                masks underlying temporal fluctuations of
                intracellular signalling events. We have inferred
                fundamental properties of the chemotaxis network
                from a noise analysis of behavioural variations in
                individual bacteria. Here we show that certain
                properties established by population measurements,
                such as adapted states, are not conserved at the
                single-cell level: for timescales ranging from
                seconds to several minutes, the behaviour of
                non-stimulated cells exhibit temporal variations
                much larger than the expected statistical
                fluctuations. We find that the signalling network
                itself causes this noise and identify the molecular
                events that produce it. Small changes in the
                concentration of one key network component suppress
                temporal behavioural variability, suggesting that
                such variability is a selected property of this
                adaptive system.},
  comments   = {See also the <a
                href="chemotaxis/korobkova-etal-04-suppl.pdf">supplement</a>. I
                believe that the explanation by
                \cite{tu-grinstein-05} is basically correct. This is
                the same old phenomenon explained by Paul Horn -- 1/f
                as an exponent of a uniform distribution.},
}



@article{cluzel-etal-00,
  author     = {P Cluzel and M Surette and S Leibler},
  title      = {An Ultrasensitive Bacterial Motor Revealed by
                Monitoring Signaling Proteins in Single Cells},
  journal    = {Science},
  volume     = 287,
  pages      = {1652--1655},
  year       = 2000,
  pdf        = {chemotaxis/cluzel+al_00.pdf},
  abstract   = {Understanding biology at the single-cell level
                requires simultaneous measurements of biochemical
                parameters and behavioral characteristics in
                individual cells. Here, the output of individual
                flagellar motors in Escherichia coli was measured as
                a function of the intracellular concentration of the
                chemotactic signaling protein. The concentration of
                this molecule, fused to green fluorescent protein,
                was monitored with fluorescence correlation
                spectroscopy. Motors from different bacteria
                exhibited an identical steep input-output relation,
                suggesting that they actively contribute to signal
                amplification in chemotaxis. This experimental
                approach can be extended to quantitative in vivo
                studies of other biochemical networks.},
  comments   = {Interestingly, it looks like the motor does not
                adapt -- all of the different concentrations and
                concentration sweeps seem to fall on the same
                "ultrasensitive" curve. But is it really so? What
                was the temporal statistics of CheYP in the cell?
                Was it constant (with the sample variance of 24\% of
                the mean), or is each cell fluctuating over this
                range? What happens if IPTG concentration is lower
                than the lowest we used? Will we see a motor
                adaptation? Is it possible to monitor CheYP and bias
                at the same time for long time periods?},
}

@article{tu-grinstein-05,
  author     = {Yuhai Tu and G Grinstein},
  title      = {How white noise generates power-law switching in
                bacterial flagellar motors},
  journal    = {Phys. Rev. Lett.},
  volume     = 94,
  pages      = {208101},
  year       = 2005,
  pdf        = {chemotaxis/tu-grinstein-05.pdf},
  entered_on = {05/11/05},
  abstract   = {The clockwise (CW) or counter clockwise (CCW)
                spinning of a bacterial flagellar motor is
                controlled by the concentration, [Y ], of a
                phosphorylated protein, CheY-P. Representing the
                stochastic switching behavior of the motor by a
                dynamical two-state (CW and CCW) model, whose energy
                levels fluctuate in time (t) as [Y ](t) fluctuates,
                we show that generic white noise in the [Y ](t)
                fluctuations can generate a power-law distribution
                for the durations of the CCW states, in agreement
                with recent experiments. Correlations between the
                duration times of nearby CCW (CW) intervals are
                predicted by our model, and shown to exist in the
                experimental data and to affect the power spectrum
                for motor switching.},
  comments   = {Defines a model of (almost) uniform distribution of
                CheYP, and then, since the correlation length of
                CheYP is long, the distribution of the switch times
                becomes a power law. I think that instead of
                numerics, much of the stuff in this paper could
                actually be calculated. Further, it is unclear that
                CheYP fluctuations must have finite variance and
                finite correlation time.},
}
 
@article{berg-00,
  author     = {H Berg},
  title      = {Motile Behavior of Bacteria},
  journal    = {Physics Today},
  volume     = 53,
  number     = 1,
  pages      = {24--29},
  month      = jan,
  year       = 2000,
  url        = {http://www.physicstoday.org/jan00/berg.htm},
  entered_on = {02/22/05},
  comments   = {Quick overview of chemotaxis.},
}

@article{barkai-leibler-97,
  author     = {N Barkai and S Leibler},
  title      = {Robustness in simple biochemical networks},
  journal    = {Nature},
  volume     = 387,
  pages      = {913--917},
  year       = 1997,
  pdf        = {chemotaxis/barkai-leibler-97.pdf},
  entered_on = {02/04/05},
  abstract   = {Cells use complex networks of interacting molecular
                components to transfer and process
                information. These "computational devices of living
                cells" are responsible for many important cellular
                processes, including cell-cycle regulation and
                signal transduction. Here we address the issue of
                the sensitivity of the networks to variations in
                their biochemical parameters. We propose a mechanism
                for robust adaptation in simple signal transduction
                networks. We show that this mechanism applies in
                particular to bacterial chemotaxis. This is
                demonstrated within a quantitative model which
                explains, in a unified way, many aspects of
                chemotaxis, including proper responses to chemical
                gradients. The adaptation property is a consequence
                of the network's connectivity and does not require
                the "fine-tuning" of parameters. We argue that the
                key properties of biochemical networks should be
                robust in order to ensure their proper functioning.},
  comments   = {A model is introduced (and numerically tested) which
                achieves perfect adaptation by means of a negative
                feedback loop that couples CheB to active receptors,
                so that demethylation of the receptors depends on
                the activity level of the receptor complex. This
                system achieves robust adaptation. That is, the
                quality (but not the speed, and not the stationary
                value) of adaptation depend very weakly on various
                kinetic constants and concentrations of
                chemicals. Is the 1\% change in the ligand enough to
                notice it by counting molecules? (According to the
                model, this should produce the drift speed of 1
                micron per second). In general, what happens if we
                introduce stochastics to this model?},
}

@article{alon-etal-99,
  author     = {U Alon and M Surette and N Barkai and S Leibler},
  title      = {Robustness in bacterial chemotaxis},
  journal    = {Nature},
  volume     = 397,
  pages      = {168--171},
  year       = 1999,
  pdf        = {chemotaxis/alon-etal-99.pdf},
  entered_on = {02/04/05},
  abstract   = {Networks of interacting proteins orchestrate the
                responses of living cells to a variety of external
                stimuli, but how sensitive is the functioning of
                these protein networks to variations in their
                biochemical parameters? One possibility is that to
                achieve appropriate function, the reaction rate
                constants and enzyme concentrations need to be
                adjusted in a precise manner, and any deviation from
                these 'fine-tuned' values ruins the network's
                performance. An alternative possibility is that key
                properties of biochemical networks are robust; that
                is, they are insensitive to the precise values of
                the biochemical parameters. Here we address this
                issue in experiments using chemotaxis of
                Escherichia coli, one of the best-characterized
                sensory systems. We focus on how response and
                adaptation to attractant signals vary with
                systematic changes in the intracellular
                concentration of the components of the chemotaxis
                network. We find that some properties, such as
                steady-state behavior and adaptation time, show
                strong variations in response to varying protein
                concentrations. In contrast, the precision of
                adaptation is robust and does not vary with the
                protein concentrations. This is consistent with a
                recently proposed molecular mechanism for exact
                adaptation, where robustness is a direct consequence
                of the network's architecture.},
  comments   = {Complete model of E. Coli chemotaxis is written
                down. If I understand it correctly, increase in the
                rate of attractor binding, leads to decrease in
                phosphorylation of CheY, leading to less
                tumbles. However, very quickly the rate of
                phosphorylation of CheY is restored by CheR/B through
                (de)methylation of the receptors, and the tumbling
                rate increases. This methylation happens because CheA
                itself phosphorylates CheB; thus high levels of
                activity of CheA immediately lead to high levels of
                activity of CheB, and, therefore, to demethylation
                of receptors. Thus to continue tumbling fast, the
                attractant concentration must be growing; the
                bacteria must be swimming up the gradient. The paper
                tests the model of \cite{barkai-leibler-97}, and
                robustness of the adaptation seems to be, in fact,
                observed. However, adaptation times vary
                drastically. Can the cell use this to adapt to the
                variance of the signal?},
}

@article{dorsogna-etal-03,
  note =	 {arXiv: cond-mat/0309386},
  title =	 {Interplay of chemotaxis and chemokinesis mechanisms
                  in bacterial dynamics},
  author =	 {D'Orsogna, M. and Suchard, M. and Chou, T.},
  journal =	 {Phys. Rev. E},
  volume =	 68,
  pages =	 {021925},
  year =	 2003,
  url =		 {http://arxiv.org/abs/cond-mat/0309386},
  pdf =		 {chemotaxis/dorsogna-etal-03.pdf},
  abstract =	 { Motivated by observations of the dynamics of {\it
                  Myxococcus xanthus}, we present a self-interacting
                  random walk model that describes the competition
                  between chemokinesis and chemotaxis. Cells are
                  constrained to move in one dimension, but release a
                  chemical chemoattractant at a steady state. The
                  bacteria senses the chemical that it produces. The
                  probability of direction reversals is modeled as a
                  function of both the absolute level of
                  chemoattractant sensed directly under each cell as
                  well as the gradient sensed across the length of the
                  cell. If the chemical does not degrade or diffuse
                  rapidly, the one dimensional trajectory depends on
                  the entire past history of the trajectory. We derive
                  the corresponding Fokker-Planck equations, use an
                  iterative mean field approach that we solve
                  numerically for short times, and perform extensive
                  Monte-Carlo simulations of the model. Cell
                  positional distributions and the associated moments
                  are computed in this feedback system. Average drift
                  and mean squared displacements are found. Crossover
                  behavior among different diffusion regimes are
                  found.},
  entered_on =	 {07/19/04},
  comments =	 {The paper deals with chemotaxis/chemokinesis. The
                  paper is fairly inventive computationally, but in
                  order to solve equations, they have to do a bunch of
                  simplifications: 1d, no diffusion, zero size of
                  bacteria, mean field, etc. Consequently, most of the
                  exact results are not directly applicable; things
                  which are clearly correct can be explained on the
                  level of handwaving arguments, and the rest are
                  probably too dependent on detailed assumptions to be
                  relevant. No long range RG analysis is done, and the
                  rest probably depends too much on the microscopic
                  details. Section B, chemotaxis: is it clearly true
                  that the concentration gradient over the cell length
                  drives chemotaxis switching rate? I thought cell
                  integrates over large distances.},
}

@article{purcell-76,
  title =	 {Life at Low {Reynolds} Number},
  author =	 {Purcell, Edward M.},
  journal =	 {American Journal of Physics},
  volume =	 {45},
  pages =	 {3--11},
  year =	 {1977},
  abstract =	 {This is a reprint of a (slightly edited) paper of
                  the same title that appeared in the book Physics and
                  Our World: A Symposium in Honor of Victor
                  F. Weisskopf, published by the American Journal of
                  Physics (1976). The personal tone of the original
                  talk has been preserved in the paper, which was
                  itself a slightly edited transcript of a tape. The
                  figures reproduce transparencies used in the
                  talk. The demonstration involved a tall rectangular
                  transparent vessel of corn syrup, projected by an
                  overhead projector turned on its side. Some
                  essential hand waving could not be reproduced.},
  entered_on =	 {06/02/03},
  pdf =		 {chemotaxis/purcell-76.pdf},
  comments =	 {A classical paper. A must read for everyone. In a
                  very clear, easy, yet deep way the paper analyses
                  the ways a bacterium could swim in a perfectly
                  laminar fluid that surrounds it.},
}


@article{rappel-etal-02,
  journal =	 {Biophysical J.},
  volume =	 83,
  month =	 sep,
  year =	 {2002},
  pages =	 {1361--1367},
  title =	 {Establishing Direction during Chemotaxis in
                  Eukaryotic Cells},
  author =	 {Rappel, Wouter-Jan and Thomas, Peter J. and Levine,
                  Herbert and Loomis, William F.},
  abstract =	 { Several recent studies have demonstrated that
                  eukaryotic cells, including amoeboid cells of
                  Dictyostelium discoideum and neutrophils, respond to
                  chemoattractants by translocation of PH-domain
                  proteins to the cell membrane, where these proteins
                  participate in the modulation of the cytoskeleton
                  and relay of the signal. When the chemoattractant is
                  released from a pipette, the localization is found
                  predominantly on the proximal side of the cell. The
                  recruitment of PH-domain proteins, particularly for
                  Dictyostelium cells, occurs very rapidly ( 2
                  s). Thus, the mechanism responsible for the first
                  step in the directional sensing process of a cell
                  must be able to establish an asymmetry on the same
                  time scale. Here, we propose a simple mechanism in
                  which a second messenger, generated by local
                  activation of the membrane, diffuses through the
                  interior of the cell, suppresses the activation of
                  the back of the cell, and converts the temporal
                  gradient into an initial cellular
                  asymmetry. Numerical simulations show that such a
                  mechanism is plausible. Available evidence suggests
                  that the internal inhibitor may be cGMP, which
                  accumulates within less than a second following
                  treatment of cells with external cAMP.},
  entered_on =	 {09/18/03},
  pdf =		 {chemotaxis/rappel_etal_02.pdf},
  comments =	 {The paper numerically analyzes a specific model for
                  directional sensing in eukaryotic cells. The model
                  involves elements of "local activation--global
                  inhibition" and "first arrival time" scenarios
                  discussed in \cite{devreotesa-janetopoulos-03}, and
                  identifies cGMP as the global inhibitor in the
                  scenario. From Fig. 3(a) it is clear that the cell
                  does not actually sense the constant spatial
                  gradient, instead it responds to a temporary
                  gradient in the chemical concentration, basically
                  polarizing the cell in the direction of the first
                  chemical arrival, and then using global inhibition
                  to preserve the state. I have no idea whether this
                  is a biologically relevant behavior: do cells sense
                  first arrival times and lock to them? or do they
                  actually sense gradients? or both? This being the
                  "first arrival" approach, I have some trouble
                  understanding how the continuous PDE can be used to
                  study its behavior: an accidental molecule hitting
                  the cell on the back side before the main front
                  reaches it in the front may polarize cell in the
                  completely wrong direction. I would prefer to see
                  fluctuations analysis done, or turning the PDE model
                  into stochastic PDE's. Further, even though the
                  authors mention that their analysis should be viewed
                  as analyzing perturbations above the steady state
                  level, I do not see how it works out. In particular,
                  the comment of \cite{devreotesa-janetopoulos-03}, I
                  think, still holds: with such a set of PDE's, this
                  cell will not be able to sense direction if the
                  chemical's concentration is being lowered on one
                  side from the uniformly high level (rather than
                  being raised). Finally, there may be issues with the
                  experimental setup the authors suggest to test their
                  model: introduce chemical from two pipettes on
                  different sides of the cell, with the drop on one
                  side being released some time after the other. They
                  claim that the mechanism will produce a sigmoidal
                  form of the asymmetry ratio as a function of the
                  delay between releases (no asymmetry in activation
                  for small delays, and vice versa). In my
                  understanding, <em>any</em> reasonable direction
                  sensitivity mechanism will have such response. They
                  will only differ in the details of the sigmoid
                  curve. And the authors did not investigate how
                  robust is the sigmoid of their model to the choice
                  of parameters.},
}

@article{devreotesa-janetopoulos-03,
  journal =	 {J.\ Biolog.\ Chem.},
  volume =	 278,
  number =	 23,
  month =	 jun,
  pages =	 {20445--20448},
  year =	 2003,
  title =	 {Eukaryotic Chemotaxis: Distinctions between
                  Directional Sensing and Polarization},
  author =	 {Devreotes, Peter and Janetopoulos, Chris},
  abstract =	 {Directional sensing and polarization are fundamental
                  cellular responses that play a central role in
                  health and disease. In this review we define each
                  process and evaluate a series of models previously
                  proposed to explain these phenomena. New findings
                  show that directional sensing by G protein-coupled
                  receptors is localized at a discrete step in the
                  signaling pathway downstream of G protein activation
                  but upstream of the accumulation of PIP3. Local
                  levels of PIP3, whether triggered by
                  chemoattractants, particle binding, or spontaneous
                  events, determine the sites of new actin-filled
                  projections. Robust control of the temporal and
                  spatial levels of PIP3 is achieved by reciprocal
                  regulation of PI3K and PTEN. These observations
                  suggest that a local excitation-global inhibition
                  model can account for the localization of PI3K and
                  PTEN and thereby explain directional
                  sensing. However, elements of other models,
                  including positive feedback and the reaction of the
                  cytoskeleton, must be invoked to account for
                  polarization.},
  entered_on =	 {09/18/03},
  pdf =		 {chemotaxis/devreotesa-janetopoulos-03.pdf},
  comments =	 {This is a mini-review of mechanisms involved in
                  Eukaryotic chemotaxis. The authors suggest a bunch
                  of phenomenological models responsible for direction
                  sensitivity and argue that only the "local
                  excitation -- global inhibition" model makes sense
                  given the experimental data. They review the actual
                  biological pathways that may be responsible for the
                  process, and discuss the directional actin
                  polymerization as involved in reacting to the
                  directional response and amplifying the response. It
                  would be interesting to see how the fluctuations in
                  the number of incoming chemicals may influence some
                  of the conclusions (a la Berg and Purcell molecule
                  counting). Further, it's still somewhat unclear to
                  me if they argue that the cells respond to a
                  chemical gradient, or if they respond to a temporal
                  gradient (first arrival). Finally, it's probably
                  useful to think if the cell motility in presence or
                  absence of chemicals can be viewed as manifestation
                  of the same mechanism, where in the zero
                  concentration case cell exhibits random motion
                  because of the strongly amplified random nature of
                  fluctuations being input in its mobile unit, and in
                  the nonzero concentration case, the cell responds to
                  a (a lot less amplified) real signal. <p> One should
                  think about what is the relation of the sensing
                  mechanisms described here to various polarity
                  establishing phenomena. Can polarity (say, planar
                  cell polarity)},
}

@Article{jeffreys46,
  author =	 {Harold Jeffreys},
  title =	 {An invariant form for the prior probability in
                  estimation problems},
  journal =	 {Proc.\ R.\ Soc., Ser.\ A, Math.\ and Phys.\ Sci.},
  year =	 {1946},
  volume =	 {186},
  number =	 {1007},
  pages =	 {453--461},
  pdf =		 {choosing_prior/jeffreys-46.pdf},
  abstract =	 {It is shown that a certain differential form
                  depending on the values of the parameters in a law
                  of chance is invariant for all transformations of
                  the parameters when the law is differentiable with
                  regards to all parameters. For laws containing a
                  location and a scale parameter a form with a somewhat
                  restricted type of invariance is found even when the
                  law is not everywhere differentiable with regard to
                  the parameters. This form has the properties
                  required to give a general rule for stating the
                  prior probability in a large class of estimation
                  problems.},
  comments =	 {What is now known as the Jeffreys' prior is
                  derived.},
}

@InProceedings{bilmes-etal-01,
  author =	 {J. Bilmes and G. Ji and M. Meila},
  title =	 {Intransitive Likelihood-Ratio Classifiers},
  booktitle =	 {Advances in Neural Information Processing Systems
                  14},
  editor =	 {T. G. Dietterich and S. Becker and Z. Ghahramani},
  publisher =	 {MIT Press},
  address =	 {Cambridge, MA},
  year =	 {2002},
  pdf =		 {classify/bilmes-etal-01.pdf},
  entered_on =	 {10/29/03},
  abstract =	 {In this work, we introduce an information-theoretic
                  based correction term to the likelihood ratio
                  classification method for multiple classes. Under
                  certain conditions, the term is sufficient for
                  optimally correcting the difference between the true
                  and estimated likelihood ratio, and we analyze this
                  in the Gaussian case. We find that the new
                  correction term significantly improves the
                  classification results when tested on medium
                  vocabulary speech recognition tasks. Moreover, the
                  addition of this term makes the class comparisons
                  analogous to an intransitive game and we therefore
                  use several tournament-like strategies to deal with
                  this issue. We find that further small improvements
                  are obtained by using an appropriate
                  tournament. Lastly, we find that intransitivity
                  appears to be a good measure of classification
                  confidence.},
  comments =	 {I spent a couple of months trying to figure out what
                  happens in this paper, thinking that the "great
                  results" may hold some deep knowledge. I am happy to
                  be proven wrong, however, my current conclusion is
                  that this paper is quite wrong. In particular,
                  section 2 (the derivation of the method) makes no
                  sense to me.},
}

@article{fraley-raftery-02,
  title =	 {Model-Based Clustering, Discriminant Analysis, and
                  Density Estimation},
  author =	 {Chris Fraley and Adrian Raftery},
  abstract =	 {Cluster analysis is the automated search for groups
                  of related observations in a data set. Most
                  clustering done in practice is based largely on
                  heuristic but intuitively reasonable procedures, and
                  most clustering methods available in commercial
                  software are also of this type. However, there is
                  little systematic guidance associated with these
                  methods for solving important practical questions
                  that arise in cluster analysis, such as how many
                  clusters are there, which clustering method should
                  be used, and how should outliers be handled. We
                  review a general methodology for model-based
                  clustering that provides a principled statistical
                  approach to these issues. We also show that this can
                  be useful for other problems in multivariate
                  analysis, such as discriminant analysis and
                  multivariate density estimation. We give examples
                  from medical diagnosis, minefield detection, cluster
                  recovery from noisy data, and spatial density
                  estimation. Finally, we mention limitations of the
                  methodology and discuss recent developments in
                  model-based clustering for non-Gaussian data, high
                  dimensional data sets, large data sets, and Bayesian
                  estimation.},
  journal =	 {J. Am. Stat. Assoc.},
  year =	 2002,
  volume =	 97,
  number =	 458,
  pages =	 {611--641},
  pdf =		 {clustering/fraley-raftery-02.pdf},
  comments =	 {Bayesian model selection used to evaluate the
                  appropriate number of clusters.},
}
@article{still-bialek-04,
  author =	 {Still, S. and Bialek, W.},
  title =	 {How many clusters? {An} information-theoretic
                  perspective},
  journal =	 {Neural Comp.},
  year =	 2004,
  pdf =		 {clustering/still-bialek-04.pdf},
  abstract =	 {Clustering provides a common means of identifying
                  structure in complex data, and there is renewed
                  interest in clustering as a tool for the analysis of
                  large data sets in many fields. A natural question
                  is how many clusters are appropriate for the
                  description of a given system. Traditional
                  approaches to this problem are based on either a
                  framework in which clusters of a particular shape
                  are assumed as a model of the system or on a
                  two-step procedure in which a clustering criterion
                  determines the optimal assignments for a given
                  number of clusters and a separate criterion measures
                  the goodness of the classification to determine the
                  number of clusters. In a statistical mechanics
                  approach, clustering can be seen as a trade-off
                  between energy- and entropy-like terms, with lower
                  temperature driving the proliferation of clusters to
                  provide a more detailed description of the data. For
                  finite data sets, we expect that there is a limit to
                  the meaningful structure that can be resolved and
                  therefore a minimum temperature beyond which we will
                  capture sampling noise. This suggests that
                  correcting the clustering criterion for the bias
                  that arises due to sampling errors will allow us to
                  find a clustering solution at a temperature that is
                  optimal in the sense that we capture maximal
                  meaningful structure - without having to define an
                  external criterion for the goodness or stability of
                  the clustering. We show that in a general
                  information-theoretic framework, the finite size of
                  a data set determines an optimal temperature, and we
                  introduce a method for finding the maximal number of
                  clusters that can be resolved from the data in the
                  hard clustering limit.},
  volume =	 16,
  number =	 12,
  pages =	 {2483--2506},
  comments =	 {Corrections to the IB loss function due to small
                  errors in estimation of the underlying probability
                  distribution are considered. It's shown that such
                  corrections lead to a control of the number of
                  admissible clusters.},
}

@inproceedings{friedman-etal-01,
  author =	 {Nir Friedman and Ori Mosenzon and Noam Slonim and
                  Naftali Tishby},
  title =	 {Multivariate Information Bottleneck},
  booktitle =	 {Uncertainty in Artificial Intelligence: Proceedings
                  of the Seventeenth Conference (UAI-2001)},
  publisher =	 {Morgan Kaufmann Publishers},
  address =	 {San Francisco, CA},
  year =	 {2001},
  pages =	 {152--161},
  entered_on =	 {04/27/04},
  abstract =	 {The Information bottleneck method is an unsupervised
                  non-parametric data organization technique. Given a
                  joint distribution $P(A,B)$, this method constructs
                  a new variable $T$ that extracts partitions, or
                  clusters, over the values of $A$ that are
                  informative about $B$. The
                  information bottleneck has already been applied to
                  document classification, gene expression, neural
                  code, and spectral analysis. In this paper, we
                  introduce a general principled framework for
                  multivariate extensions of the information
                  bottleneck method. This allows us to consider
                  multiple systems of data partitions that are
                  inter-related. Our approach utilizes Bayesian
                  networks for specifying the systems of clusters and
                  what information each captures. We show that this
                  construction provides insight about bottleneck
                  variations and enables us to characterize solutions
                  of these variations. We also present a general
                  framework for iterative algorithms for constructing
                  solutions, and apply it to several examples.},
  pdf =		 {clustering/friedman_etal_01.ps},
}


@incollection{parker-etal-03,
  author =	 {Albert E. Parker and Tomas Gedeon and Alexander
                  G. Dimitrov},
  title =	 {Annealing and the Rate Distortion Problem},
  booktitle =	 {Advances in Neural Information Processing Systems
                  15},
  editor =	 {S. Becker and S. Thrun and K. Obermayer},
  publisher =	 {MIT Press},
  address =	 {Cambridge, MA},
  pages =	 {969--976},
  year =	 {2003},
  entered_on =	 {10/29/03},
  abstract =	 {In this paper we introduce methodology to determine
                  the bifurcation structure of optima for a class of
                  similar cost functions from Rate Distortion Theory,
                  Deterministic Annealing, Information Distortion and
                  the Information Bottleneck Method. We also introduce
                  a numerical algorithm which uses the explicit form
                  of the bifurcating branches to find optima at a
                  bifurcation point.},
  pdf =		 {clustering/parker-etal-03.pdf},
  comments =	 {},
}


@TechReport{gilad-bachrach-03,
  author =	 {Ran Gilad-Bachrach and Amir Navot and Naftali
                  Tishby},
  title =	 {An information theoretic tradeoff between complexity
                  and accuracy},
  institution =	 {Hebrew University, CS Department},
  year =	 {2003},
  postscript =	 {clustering/gilad-bachrach-etal-03.ps.gz},
  entered_on =	 {10/29/03},
  abstract =	 {A fundamental question in learning theory is the
                  quantification of the basic tradeoff between the
                  complexity of a model and its predictive
                  accuracy. One valid way of quantifying this
                  tradeoff, known as the "Information Bottleneck", is
                  to measure both the complexity of the model and its
                  prediction accuracy by using Shannon's mutual
                  information. In this paper we show that the
                  Information Bottleneck framework answers a well
                  defined and known coding problem and at the same
                  time it provides a general relationship between
                  complexity and prediction accuracy, measured by
                  mutual information. We study the nature of the
                  complexity-accuracy tradeoff and discuss some of its
                  theoretical properties. Furthermore, we present
                  relations to classical information theoretic
                  problems, such as rate-distortion theory,
                  cost-capacity tradeoff and source coding with side
                  information.},
  comments =	 {The following things are shown: similarity between
                  IB and source coding with side information;
                  sufficiency of \card(\tilde{X}) >= \card(X) for
                  optimal encoding; equivalence of IB and local
                  rate-distortion problems; IB coding theorem is
                  proven. My concerns here are that I don't exactly
                  see how the IB coding theorem is a conceptual
                  improvement over original \cite{ib-00}; though, of
                  course, this makes the original paper "cleaner" with
                  all the proofs of existence of codebooks, etc.},
}

@InProceedings{chechik-etal-03,
  author =	 {Gal Chechik and Amir Globerson and Naftali Tishby
                  and Yair Weiss},
  title =	 {Information bottleneck for {G}aussian variables},
  booktitle =	 {Advances in Neural Information Processing Systems},
  volume =	 16,
  editor =	 {Sebastian Thrun and Lawrence Saul and Bernhard
                  Sch{\"o}lkopf},
  publisher =	 {MIT Press},
  address =	 {Cambridge, MA},
  year =	 {2004},
  entered_on =	 {10/09/03},
  abstract =	 {The problem of extracting the relevant aspects of
                  data was addressed through the information
                  bottleneck (IB) method by (soft) clustering one
                  variable while preserving information about another
                  -- relevance -- variable. An interesting question
                  addressed in the current work is the extension of
                  these ideas to obtain continuous representations
                  (embeddings) that preserve relevant information,
                  rather than discrete clusters. We give a formal
                  definition of the general continuous IB problem and
                  obtain an analytic solution for the optimal
                  representation for the important case of
                  multivariate Gaussian variables. The obtained
                  optimal representation is a noisy linear projection
                  to eigenvectors of the normalized correlation matrix
                  $\Sigma_{x|y} \Sigma_x^{-1}$, which is also the basis
                  obtained in the Canonical Correlation
                  Analysis. However, in Gaussian IB, the compression
                  tradeoff parameter uniquely determines the
                  dimension, as well as the scale of each
                  eigenvector. This introduces a novel interpretation
                  where solutions of different ranks lie on a
                  continuum parameterized by the compression
                  level. Our analysis also provides analytic
                  expression for the optimal tradeoff -- the
                  information curve -- in terms of the eigenvalue
                  spectrum.},
  postscript =	 {clustering/chechik-etal-03.ps.gz},
  comments =	 {First (to my knowledge) continuous form of the
                  IB. Compression of a Gaussian is, not surprisingly,
                  a wider Gaussian. Depending on $\beta$ the rank of
                  the compression variable changes, and only those
                  eigenvectors of the matrix of y conditional on x
                  participate in compression, for which their
                  eigenvalue is $<1-1/\beta$. The active eigenvectors
                  are turned on smoothly at their respective critical
                  $\beta$'s. Basically, IB just takes eigendirections
                  and smoothes them out, projects them (increases
                  variance for them); the fattest (least informative)
                  directions disappear the first. The information
                  curve is smooth, and it consists of a bunch of
                  logarithmic pieces (exact form is available). It is
                  interesting to check if a bunch of these logarithmic
                  pieces may look like a power--law scaling we are
                  working on.},
}


@article{blatt-etal-97,
  author =	 {Marcelo Blatt and Shai Wiseman and Eytan Domany},
  note =	 {{LANL} eprint, cond-mat/9702072},
  title =	 {Super-paramagnetic clustering of data},
  journal =	 {Phys. Rev. Lett.},
  year =	 1996,
  volume =	 76,
  pages =	 {3251--3254},
  pdf =		 {clustering/blatt_wiseman_domany_97.pdf},
  url =		 {http://arxiv.org/abs/cond-mat/9702072},
  entered_on =	 {04/09/03},
  abstract =	 {We present a new approach to clustering, based on
                  the physical properties of an inhomogeneous
                  ferromagnet. No assumption is made regarding the
                  underlying distribution of the data. We assign a
                  Potts spin to each data point and introduce an
                  interaction between neighboring points, whose
                  strength is a decreasing function of the distance
                  between the neighbors. This magnetic system exhibits
                  three phases. At very low temperatures it is
                  completely ordered; i.e. all spins are aligned. At
                  very high temperatures the system does not exhibit
                  any ordering and in an intermediate regime clusters
                  of relatively strongly coupled spins become ordered,
                  whereas different clusters remain uncorrelated. This
                  intermediate phase is identified by a jump in the
                  order parameters. The spin-spin correlation function
                  is used to partition the spins and the corresponding
                  data points into clusters. We demonstrate on three
                  synthetic and three real data sets how the method
                  works. Detailed comparison to the performance of
                  other techniques clearly indicates the relative
                  success of our method.},
  comments =	 {This is a cute physical model of clustering. But, so
                  what? There are many different clustering
                  algorithms, and none is better than the rest. One
                  should be thinking about first principle for
                  clustering, for the measure used for it, instead of
                  having yet another clustering procedure.},
}

% NOTE(review): the title now carries its leading "The", and the paper's
% private macros in the abstract were expanded to plain math (tilde
% variables for the codewords) -- confirm against the arXiv listing.
@Unpublished{ib-00,
  author =	 {Naftali Tishby and Fernando Pereira and William
                  Bialek},
  title =	 {The information bottleneck method},
  note =	 {{LANL} eprint, physics/0004057},
  year =	 {2000},
  month =	 apr,
  pdf =		 {clustering/bottleneck.pdf},
  url =		 {http://arxiv.org/abs/physics/0004057},
  entered_on =	 {07/23/03},
  abstract =	 {We define the relevant information in a signal $x\in
                  X$ as being the information that this signal
                  provides about another signal $y\in Y$. Examples
                  include the information that face images provide
                  about the names of the people portrayed, or the
                  information that speech sounds provide about the
                  words spoken. Understanding the signal $x$ requires
                  more than just predicting $y$, it also requires
                  specifying which features of $X$ play a role in the
                  prediction. We formalize this problem as that of
                  finding a short code for $X$ that preserves the
                  maximum information about $Y$. That is, we squeeze
                  the information that $X$ provides about $Y$
                  through a `bottleneck' formed by a limited set of
                  codewords $\tilde{X}$. This constrained optimization
                  problem can be seen as a generalization of rate
                  distortion theory in which the distortion measure
                  $d(x,\tilde{x})$ emerges from the joint statistics
                  of $X$ and $Y$. This approach yields an exact set of
                  self consistent equations for the coding rules $X
                  \to \tilde{X}$ and $\tilde{X} \to Y$. Solutions to
                  these equations can be found by a convergent
                  re-estimation method that generalizes the
                  Blahut-Arimoto algorithm. Our variational principle
                  provides a surprisingly rich framework for
                  discussing a variety of problems in signal
                  processing and learning, as will be described in
                  detail elsewhere.},
  comments =	 {By now this is a classic paper that deals with
                  extracting information in one variable that is
                  relevant for another using information theoretic
                  ideas.},
}

@inproceedings{slonim-01,
  author =	 {N. Slonim and N. Friedman and N. Tishby},
  title =	 {Agglomerative Multivariate Information Bottleneck},
  booktitle =	 {Advances in Neural Information Processing Systems
                  14},
  editor =	 {T. G. Dietterich and S. Becker and Z. Ghahramani},
  publisher =	 {MIT Press},
  address =	 {Cambridge, MA},
  year =	 2002,
  entered_on =	 {07/24/03},
  abstract =	 {The information bottleneck method is an unsupervised
                  non-parametric data organization technique. Given a
                  joint distribution P(A,B), this method constructs a
                  new variable T that extracts partitions, or
                  clusters, over the values of A that are informative
                  about B. In a recent paper, we introduced a general
                  principled framework for multivariate extensions of
                  the information bottleneck method that allows us to
                  consider multiple systems of data partition that are
                  inter-related. In this paper, we present a new
                  family of simple agglomerative algorithms to
                  construct such systems of inter-related clusters. We
                  analyze the behavior of these algorithms and apply
                  them to several real-life datasets.},
  postscript =	 {clustering/slonim_friedman_tishby_01.ps.gz},
  comments =	 {Another spin on the algorithmics of the bottleneck.},
}

% NOTE(review): editor names must be joined with " and "; the original
% comma-separated list makes BibTeX fail with "too many commas".
@inproceedings{slonim-tishby-2000,
  author =	 {Noam Slonim and Naftali Tishby},
  title =	 {Agglomerative information bottleneck},
  booktitle =	 {Advances in Neural Information Processing Systems
                  12},
  editor =	 {S. A. Solla and T. K. Leen and K.-R. M{\"u}ller},
  publisher =	 {MIT Press},
  address =	 {Cambridge, MA},
  year =	 2000,
  postscript =	 {clustering/slonim-tishby-00.ps.gz},
  abstract =	 {We introduce a novel distributional clustering
                  algorithm that explicitly maximizes the mutual
                  information per cluster between the data and given
                  categories. This algorithm can be considered as a
                  bottom up hard version of the recently introduced
                  ``Information Bottleneck Method''. We relate the
                  mutual information between clusters and categories
                  to the Bayesian classification error, which provides
                  another motivation for using the obtained clusters
                  as features. The algorithm is compared with the
                  top-down soft version of the information bottleneck
                  method and a relationship between the hard and soft
                  results is established. We demonstrate the algorithm
                  on the {\em 20 Newsgroups} data set. For a subset of
                  two news-groups we achieve compression by 3 orders
                  of magnitudes loosing only 10\% of the original
                  mutual information.},
  entered_on =	 {07/24/03},
  comments =	 {Bottleneck with hard clustering, an algorithmic
                  spin.},
}

@article{badii-politi-97,
  journal =	 {Phys. Rev. Lett.},
  volume =	 78,
  pages =	 {444--447},
  year =	 1997,
  title =	 {Thermodynamics and Complexity of Cellular Automata},
  author =	 {Remo Badii and Antonio Politi},
  abstract =	 {The complexity exhibited by cellular automata is
                  studied using both topological (graph-theoretical)
                  and metric (thermodynamic) techniques. A novel
                  topological classification, based on a hierarchy of
                  languages, is introduced. In particular, it is shown
                  that the elementary rule 22 is able to produce, upon
                  iteration, a deep nesting of grammatical rules and
                  that this asymptotically yields a phase transition
                  when the thermodynamic formalism is applied to the
                  limit spatial configuration.},
  entered_on =	 {04/25/04},
  pdf =		 {complexity/badii-politi-97.pdf},
  comments =	 {Even earlier paper on the subject is
                  \cite{alessandro-politi-90}. The authors analyse the
                  exponent of the growth of the number of n-mers
                  coming from a source as a function of n. Then they
                  study the growth of the irreducible prohibited words
                  of length n from the same source, and then the
                  growth of prohibited words among prohibited words,
                  etc., this way building up a hierarchy of
                  characterizations of complexity of the source. Don't
                  quite know how to connect this to other work.},
}

% NOTE(review): first author's surname corrected to D'Alessandro; the
% citation key is left unchanged so existing \cite commands keep working.
@article{alessandro-politi-90,
  journal =	 {Phys. Rev. Lett.},
  volume =	 64,
  pages =	 {1609--1612},
  year =	 1990,
  title =	 {Hierarchical approach to complexity with
                  applications to dynamical systems},
  author =	 {G D'Alessandro and A Politi},
  abstract =	 {A hierarchical approach to complexity of infinite
                  stationary strings of symbols is introduced by
                  investigating the scaling behavior of suitable
                  quantities. The topological entropy, which estimates
                  the growth rate of the number of admissible words,
                  corresponds to the first-order indicator C(1). At
                  the second level, a novel indicator C(2) is
                  introduced which measures the growth rate of the
                  number of irreducible forbidden words. Finally, a
                  detailed analysis of 2D maps reveals that C(2) can
                  be expressed in terms of the Lyapunov exponents.},
  entered_on =	 {04/25/04},
  comments =	 {See \cite{badii-politi-97}.},
  pdf =		 {complexity/alessandro-politi-90.pdf},
}


% NOTE(review): "(wijwji)" in the abstract was a garbled inequality; it is
% restored as w_{ij} \neq w_{ji} (asymmetric couplings) -- confirm against
% the published abstract.
@article{greenfield-lecar-01,
  journal =	 {Phys. Rev. E},
  volume =	 63,
  pages =	 {041905},
  year =	 2001,
  title =	 {Mutual information in a dilute, asymmetric neural
                  network model},
  author =	 {E Greenfield and H Lecar},
  abstract =	 {Neural networks with asymmetric synaptic connections
                  ($w_{ij} \neq w_{ji}$) display a broad range of
                  dynamical behavior including fixed point, periodic,
                  and ``chaotic'' trajectories. Previous work has
                  shown that such networks undergo an order-chaos
                  phase transition as various network parameters, such
                  as the connectivity or the degree of asymmetry, are
                  changed. Here, using an information theoretic
                  approach, we present results which suggest that
                  neurons are able to communicate information to each
                  other most effectively in networks that are near the
                  order-chaos transition. We then extend the model to
                  incorporate some biologically relevant features.},
  entered_on =	 {04/25/04},
  pdf =		 {complexity/greenfield-lecar-01.pdf},
  comments =	 {Mutual information between pre- and postsynaptic
                  neurons in a synthetic, binary, asymmetric recurrent
                  neural network is analyzed numerically and
                  analytically. It is found that this mutual
                  information is maximized when the whole network
                  undergoes regular to chaotic behavior transition
                  (some previous literature on the subject is listed;
                  good collection). This is much like the logistic map
                  at the threshold of chaos (Grassberger) and other
                  similar examples. The ``converse'' statement on page
                  2, paragraph 4 is hardly clear: why should the
                  mutual information discussed be large to perform
                  useful complicated computations?},
}

@article{binder-plazas-01,
  author =	 {P-M Binder and J Plazas},
  title =	 {Multiscale analysis of complex systems},
  journal =	 {Phys. Rev. E},
  volume =	 63,
  pages =	 {065203(R)},
  entered_on =	 {04/26/04},
  abstract =	 {We calculate block information versus size profiles
                  for two-symbol strings generated by several
                  dynamical processes: random, periodic, regular
                  language, and substitutive. The profiles provide a
                  good diagnostics of the complexity of the string.},
  pdf =		 {complexity/binder-01.pdf},
  comments =	 {The authors should have cited \cite{bnt-01}, which
                  appeared on arXiv a couple of months before this
                  paper. Good collection of articles on various
                  definitions of complexity. The comment on page 2
                  that they have found a nonrandom deviation in the
                  random number generator is most probably wrong -- I
                  think, they just start to underestimate the entropy
                  for this word length and do not realize that their
                  estimator is biased \cite{nsb}.},
  year =	 2001,
}

@misc{li-etal-01,
  howpublished = {E-print cs.CC/0111054 and in Proc. 14th ACM-SIAM
                  Symp. Discrete Algorithms, 2003},
  year =	 2001,
  title =	 {The similarity metric},
  author =	 {Ming Li and Xin Chen and Xin Li and Bin Ma and Paul
                  Vitanyi},
  abstract =	 {A new class of metrics appropriate for measuring
                  effective similarity relations between sequences,
                  say one type of similarity per metric, is
                  studied. We propose a new ``normalized information
                  distance'', based on the noncomputable notion of
                  Kolmogorov complexity, and show that it minorizes
                  every metric in the class (that is, it is universal
                  in that it discovers all effective similarities). We
                  demonstrate that it too is a metric and takes values
                  in $[0,1]$; hence it may be called the {\em
                  similarity metric}. This is a theory foundation for
                  a new general practical tool. We give two
                  distinctive applications in widely divergent areas
                  (the experiments by necessity use just computable
                  approximations to the target notions). First, we
                  computationally compare whole mitochondrial genomes
                  and infer their evolutionary history. This results
                  in a first completely automatic computed whole
                  mitochondrial phylogeny tree. Secondly, we give
                  fully automatically computed language tree of 52
                  different language based on translated versions of
                  the ``Universal Declaration of Human Rights''.},
  url =		 {http://arxiv.org/abs/cs.CC/0111054},
  pdf =		 {complexity/li-etal-01.pdf},
  entered_on =	 {11/02/03},
  keywords =	 {NIPS02_UL},
  comments =	 {Good historical review. When all the dust settles,
                  very primitive ``compression'' tools are used as a
                  Kolmogorov complexity estimator for the input to the
                  similarity metric, very similar to the
                  ``probabilistic'' setting (see a very similar paper
                  by Loreto and co-authors on arxiv.org).},
}

% NOTE(review): year was "2001/2002", which is not a valid year value;
% 2001 was chosen to match the citation key -- confirm against the
% volume's copyright page.
@incollection{vitanyi-li-01,
  author =	 {P Vitanyi and M Li},
  title =	 {Simplicity, Information, Kolmogorov Complexity, and
                  Prediction},
  pages =	 {135--155},
  booktitle =	 {Simplicity, Inference and Modelling},
  editor =	 {Arnold Zellner and Hugo A. Keuzenkamp and Michael
                  McAleer},
  publisher =	 {Cambridge University Press},
  address =	 {Cambridge, UK},
  year =	 2001,
  entered_on =	 {11/02/03},
  keywords =	 {NIPS02_UL},
  postscript =	 {complexity/vitanyi-li-01.ps.gz},
  abstract =	 {In contrast to statistical entropy which measures
                  the quantity of information in an average object of
                  a given probabilistic ensemble, Kolmogorov
                  complexity is the quantity of absolute information
                  in an individual object. It is a novel notion of
                  randomness and resolves problems of probability
                  theory, statistical information theory, and
                  philosophy. It is widely believed, based on
                  `Occam's razor' paradigm about `simplicity', that
                  the better a theory compresses the data concerning
                  some phenomenon under investigation, the better we
                  have learned, generalized, and the better the theory
                  predicts unknown data. This belief is vindicated in
                  practice but apparently has not been rigorously
                  proved in a general setting. Making these ideas
                  rigorous involves the length of the shortest
                  effective description of an individual object: its
                  Kolmogorov complexity. We treat the relation between
                  data compression and learning and show that
                  compression is almost always the best strategy, both
                  in hypotheses identification by using the minimum
                  description length (MDL) principle and in prediction
                  methods in the style of R. Solomonoff.},
  comments =	 {In many respects, this paper is superseded by
                  \cite{vitanyi-01}. Some comments on Bayesian
                  inference are clearly wrong (e.g., claimed inability
                  to have Occam factors built in).},
}

@misc{vitanyi-01,
  author =	 {Paul Vitanyi},
  title =	 {Randomness},
  howpublished = {E-print math.PR/0110086},
  year =	 2001,
  url =		 {http://www.arxiv.org/abs/math.PR/0110086},
  pdf =		 {complexity/vitanyi_01.pdf},
  entered_on =	 {10/31/03},
  abstract =	 {Here we present in a single essay a combination and
                  completion of the several aspects of the problem of
                  randomness of individual objects which of necessity
                  occur scattered in our textbook ``An Introduction to
                  Kolmogorov Complexity and Its Applications'' (M. Li
                  and P. Vitanyi), 2nd Ed., Springer-Verlag, 1997.},
  comments =	 {Definition of individual randomness for long strings
                  by means of algorithmic complexity. Applications and
                  methods. Very good collection of relevant quotes
                  from the ``great ones''. Kolmogorov: ``In everyday
                  language we call random those phenomena where we
                  cannot find a regularity allowing us to predict
                  precisely their results.'' Contrast of
                  unpredictability and uncompressibility. Complexity
                  oscillations, Martin-L{\"o}f randomness. Chaos and
                  Kolmogorov complexity.},
}

% NOTE(review): the "???" volume placeholder was replaced with the LNCS
% volume for ISAAC 2002 (2518) and the series name moved out of booktitle
% into its own field -- confirm the volume against the publisher listing.
@InProceedings{vitanyi-02,
  author =	 {Paul Vitanyi},
  title =	 {Meaningful Information},
  booktitle =	 {Proc.\ 13th International Symposium on Algorithms
                  and Computation (ISAAC)},
  series =	 {Lecture Notes in Computer Science},
  volume =	 {2518},
  publisher =	 {Springer-Verlag},
  address =	 {Berlin},
  year =	 2002,
  abstract =	 {The information in an individual finite object
                  (like a binary string) is commonly measured by its
                  Kolmogorov complexity. One can divide that
                  information into two parts: the information
                  accounting for the useful regularity present in the
                  object and the information accounting for the
                  remaining accidental information. There can be
                  several ways (model classes) in which the regularity
                  is expressed. Kolmogorov has proposed the model
                  class of finite sets, generalized later to
                  computable probability mass functions. The resulting
                  theory, known as Algorithmic Statistics, analyzes
                  the algorithmic sufficient statistic when the
                  statistic is restricted to the given model
                  class. However, the most general way to proceed is
                  perhaps to express the useful information as a
                  recursive function. The resulting measure has been
                  called the ``sophistication'' of the object. We
                  develop the theory of recursive functions statistic,
                  the maximum and minimum value, the existence of
                  absolutely nonstochastic objects (that have maximal
                  sophistication---all the information in them is
                  meaningful and there is no residual randomness),
                  determine its relation with the more restricted
                  model classes of finite sets, and computable
                  probability distributions, in particular with
                  respect to the algorithmic (Kolmogorov) minimal
                  sufficient statistic, the relation to the halting
                  problem and further algorithmic properties.},
  entered_on =	 {11/01/03},
  url =		 {http://xxx.lanl.gov/abs/cs.CC/0111053},
  pdf =		 {complexity/vitanyi-02.pdf},
  comments =	 {Approach akin to \cite{bnt-01}, but in the context
                  of algorithmic statistics (single strings, not
                  averages). Two-part codes for meaningful part and
                  meaningless noise. Definition of ``sufficient
                  statistics''; some applications. Definitely worth
                  reading.},
}

% NOTE(review): the entry had no year; 2003 is taken from the e-print
% identifier cond-mat/0303625 (March 2003).
@misc{shalizi-moore-03,
  howpublished = {E-print cond-mat/0303625},
  year =	 2003,
  title =	 {What Is a Macrostate? {S}ubjective Observations and
                  Objective Dynamics},
  author =	 {Cosma Rohilla Shalizi and Cristopher Moore},
  abstract =	 {We consider the question of whether thermodynamic
                  macrostates are objective consequences of dynamics,
                  or subjective reflections of our ignorance of a
                  physical system. We argue that they are both; more
                  specifically, that the set of macrostates forms the
                  unique maximal partition of phase space which 1) is
                  consistent with our observations (a subjective fact
                  about our ability to observe the system) and 2)
                  obeys a Markov process (an objective fact about the
                  system's dynamics). We review the ideas of
                  computational mechanics, an information-theoretic
                  method for finding optimal causal models of
                  stochastic processes, and argue that macrostates
                  coincide with the ``causal states'' of computational
                  mechanics. Defining a set of macrostates thus
                  consists of an inductive process where we start with
                  a given set of observables, and then refine our
                  partition of phase space until we reach a set of
                  states which predict their own future, i.e. which
                  are Markovian. Macrostates arrived at in this way
                  are provably optimal statistical predictors of the
                  future values of our observables.},
  url =		 {http://www.arxiv.org/abs/cond-mat/0303625},
  pdf =		 {complexity-philosophy/shalizi-moore-03.pdf},
  entered_on =	 {11/03/03},
  comments =	 {I quite agree with the main point of the paper
                  -- proper macrostate variables are the ones that
                  form minimal states, and the ones that agree with
                  experimental measurements, and describe them
                  well. Few points: in view of, for example, RG theory
                  -- can one actually search for a ``minimal causal
                  architectures capable of producing the statistics of
                  observed time series''?  That is, can we uniquely
                  find microscopic dynamics responsible for
                  macroscopic effects? Further, paper cites
                  \cite{bnt-01}. However, the theory of computational
                  mechanics, studied here, is a lot closer related to
                  \cite{bnt-01} and, even more so, to
                  \cite{bnt_short-01} than the author cares to say
                  -- we also consider classes equivalent when they
                  form the same conditional pdf's of futures, we also
                  consider I(past, future), etc. Basically, the causal
                  states of the current paper are the minimal
                  sufficient statistics for the past observations. It
                  might be interesting to run bottleneck for their
                  extraction. Some nice viewpoints: a sequence of
                  causal states (CS) (minimal sufficient statistics)
                  of a system is a Markov sequence since each CS
                  contains all of relevant history. Thus causal states
                  and the microstates of the world for a hidden Markov
                  model. Of course, the state space of this Markov
                  process may be huge.},
}
% NOTE(review): author order corrected to Welch and Bishop (matches the
% pdf file name). The misspelled key "bisho-welch-01" is kept on purpose
% so that existing \cite commands do not break.
@InProceedings{bisho-welch-01,
  author =	 {Greg Welch and Gary Bishop},
  title =	 {An Introduction to the Kalman Filter},
  booktitle =	 {SIGGRAPH 2001},
  address =	 {Los Angeles, CA},
  year =	 2001,
  month =	 aug,
  entered_on =	 {04/09/2003},
  pdf =		 {control/welch-bishop-01.pdf},
  abstract =	 {In putting together this course pack we decided not
                  to simply include copies of the slides for the
                  course presentation, but to attempt to put together
                  a small booklet of information that could stand by
                  itself. The course slides and other useful
                  information, including a new Java-based Kalman
                  Filter Learning Tool are available at
                  http://www.cs.unc.edu/~tracker/ref/s2001/kalman/ In
                  addition, we maintain a popular web site dedicated
                  to the Kalman filter. This site contains links to
                  related work, papers, books, and even some
                  software. http://www.cs.unc.edu/~welch/kalman/ We
                  expect that you (the reader) have a basic
                  mathematical background, sufficient to understand
                  explanations involving basic linear algebra,
                  statistics, and random signals.},
  comments =	 {A brief engineering introduction to Kalman filters.},
}


% NOTE(review): journal abbreviation normalised ("J." with period) and
% issue number 2 added -- confirm the issue number against the journal.
@article{besag-74,
  author =	 {J Besag},
  title =	 {Spatial interactions and the statistical analysis of
                  lattice systems},
  journal =	 {J. Roy. Stat. Soc., Ser. B (Methodol.)},
  year =	 1974,
  pages =	 {192--236},
  volume =	 36,
  number =	 2,
  abstract =	 {The formulation of conditional probability models
                  for finite systems of spatially interacting random
                  variables is examined. A simple and alternative
                  proof of the Hammersley-Clifford theorem is
                  presented and the theorem is then used to construct
                  specific spatial schemes on and off the
                  lattice. Particular emphasis is placed upon
                  practical applications of the models in plant
                  ecology when the variates are binary or
                  Gaussian. Some aspects of infinite lattice Gaussian
                  processes are discussed. Methods of statistical
                  analysis for lattice schemes are proposed, including
                  a very flexible coding technique. The methods are
                  illustrated by two numerical examples. It is
                  maintained throughout that the conditional
                  probability approach to the specification and
                  analysis of spatial interactions is more attractive
                  than the alternative joint probability approach.},
  pdf =		 {dependence/besag-74.pdf},
  comments =	 {The Markov network description of
                  interactions. Proof of Hammersley-Clifford
                  theorem. In some sense, irreducibility of
                  interactions is equivalent to the statement in the
                  HC theorem that only terms that correspond to a
                  clique have nonzero potentials. However, the analogy
                  is incomplete -- in a system of three pairwise
                  interacting Ising spins, the conditional pdf of each
                  spin depends on the values of both other spins. So
                  they form a clique. However, the interactions among
                  spins are only pairwise, so we do not need the third
                  order G-function to be nonzero. Such three way
                  interaction is reducible to two-way ones.},
}

% NOTE(review): "editor = {unknown}" would be typeset as a person named
% "unknown". Replaced with the editors of NIPS volume 10 and the full
% proceedings title -- confirm volume and editors against the proceedings.
@inproceedings{hofmann-tresp-98,
  title =	 {Nonlinear Markov Networks for Continuous Variables},
  author =	 {Reimar Hofmann and Volker Tresp},
  editor =	 {Michael I. Jordan and Michael J. Kearns and Sara
                  A. Solla},
  pages =	 {521--527},
  publisher =	 {MIT Press},
  year =	 1998,
  address =	 {Cambridge, MA},
  booktitle =	 {Advances in Neural Information Processing Systems
                  10},
  pdf =		 {dependence/hofmann-tresp-98.pdf},
  comments =	 {Again, no distinction between the irreducible 3-way
                  interaction and lower order interactions is done},
}

% NOTE(review): the entry had no year; 1995 is taken from the arXiv
% identifier comp-gas/9511002 (November 1995).
@misc{wolf-95,
  howpublished = {arXiv: comp-gas/9511002},
  year =	 1995,
  author =	 {D Wolf},
  title =	 {Mutual Information as a Bayesian Measure of
                  Independence},
  abstract =	 {The problem of hypothesis testing is examined from
                  both the historical and Bayesian points of view in
                  the case that sampling is from an underlying joint
                  probability distribution and the hypotheses tested
                  for are those of independence and dependence of the
                  underlying distribution. Exact results for the
                  Bayesian method are provided. Asymptotic Bayesian
                  results and historical method test quantities are
                  compared, and historical method quantities are
                  interpreted in terms of clearly defined Bayesian
                  quantities. The asymptotic Bayesian test relies upon
                  a statistic that is primarily mutual information.},
  pdf =		 {dependence/wolf-95.pdf},
  comments =	 {For Dirichlet-1 priors for discrete distributions on
                  $X \times Y$ and $X$ and $Y$ separately, one
                  evaluates which of the distributions (product, or
                  not dependent) is more a posteriori probable. This
                  is a test for independence, and the asymptotic
                  relates to standard $\chi^2$ tests and to the mutual
                  info of the sample.},
  entered_on =	 {08/25/2004},
}

% NOTE(review): edition is written as an ordinal word, as classic BibTeX
% styles expect; the pronoun in the comments was corrected (Judea Pearl).
@book{pearl-88,
  author =	 {J Pearl},
  title =	 {Probabilistic reasoning in intelligent systems:
                  networks of plausible inference},
  edition =	 {Second},
  publisher =	 {Morgan Kaufmann},
  address =	 {San Francisco, CA},
  entered_on =	 {08/26/04},
  year =	 1988,
  comments =	 {A standard book on graphical models. Confirms my
                  suspicion that either Markov or Bayesian networks
                  cannot distinguish the connected higher order
                  interactions. Also of interest is the discussion of
                  why people like to represent things in terms of
                  causal connections. According to Pearl, this is only
                  for computational simplicity; he seems to not
                  discuss the possibility that this description is
                  chosen because it represents objective reality. It
                  would be nice to use PIB techniques to show that
                  this particular representation is unique, and
                  anybody doing inference would arrive at the same
                  causal representation because of the structure of
                  the underlying world.},
}



% NOTE(review): surnames capitalised (BibTeX prints names as written, and
% lower-case name tokens risk being parsed as "von" particles).
@article{heckerman-etal-00,
  author =	 {D Heckerman and DM Chickering and C Meek and R
                  Rounthwaite and C Kadie},
  title =	 {Dependency networks for inference, collaborative
                  filtering, and data visualization},
  journal =	 {J. Mach. Learn. Res.},
  volume =	 1,
  year =	 2000,
  pages =	 {49--75},
  pdf =		 {dependence/heckerman-etal-00.pdf},
  abstract =	 {We describe a graphical model for probabilistic
                  relationships -- an alternative to the Bayesian
                  networks -- called a dependency network. The graph
                  of a dependency network, unlike a Bayesian network,
                  is potentially cyclic. The probability component of
                  a dependency network, like a Bayesian network, is a
                  set of conditional distributions, one for each node
                  given its parents. We identify several basic
                  properties of this representation and describe a
                  computationally efficient procedure for learning the
                  graph and probability components from data. We
                  describe the application of this representation to
                  probabilistic inference, collaborative filtering
                  (the task of predicting preferences), and the
                  visualization of acausal predictive relationships.},
  comments =	 {The dependency network is defined by keeping a
                  conditional distribution of each node on its parents
                  (unlike BN's, loops are allowed), which later
                  results in a JPD by means of, for example, the Gibbs
                  sampler. It's proven that Markov and dependence
                  networks are equivalent (can represent the same
                  dependencies); however, I don't see how these
                  dependency networks can distinguish two way
                  vs. higher order dependencies, for example. Some
                  references to the earlier literature might be
                  useful; then the paper marches into applications
                  orthogonal to my needs.},
}


@article{chow-liu-68,
  author =	 {CK Chow and CN Liu},
  title =	 {Approximating discrete probability distributions
                  with dependence trees},
  journal =	 {IEEE Trans. Inf. Thy.},
  volume =	 {IT-14},
  number =	 3,
  pages =	 {462--467},
  year =	 1968,
  abstract =	 {A method is presented to approximate optimally an
                  n-dimensional discrete probability distribution by a
                  product of second-order distributions, or the
                  distribution of the first order tree dependence. The
                  problem is to find an optimum set of n-1 first order
                  dependence relationships among the n variables. It
                  is shown that the procedure derived in this paper
                  yields an approximation of a minimum difference in
                  information. It is further shown that when this
                  procedure is applied to empirical observations from
                  an unknown distribution of tree dependence, the
                  procedure is the maximum likelihood estimate of the
                  distribution.},
  comments =	 {Relevant to \cite{margolin-etal-04}. Constructs a
                  strict tree of maximum mutual infos (while ARACNE
                  may have loops). First order dependences in this
                  language are same as second order interactions in
                  mine.},
  entered_on =	 {11/05/2004},
}


@article{dawid-79,
  title =	 {Conditional independence in statistical theory},
  author =	 {AP Dawid},
  journal =	 {J. Roy. Stat. Soc. Ser. B (Methodol.)},
  volume =	 41,
  number =	 1,
  year =	 1979,
  pages =	 {1--31},
  entered_on =	 {07/23/04},
  pdf =		 {dependence/dawid-79.pdf},
  abstract =	 {Some simple heuristic properties of conditional
                  independence are shown to form a conceptual
                  framework for much of the theory of statistical
                  inference. This framework is illustrated by an
                  examination of the role of conditional independence
                  in several diverse areas of the field of
                  statistics. Topics covered include sufficiency and
                  ancillarity, parameter identification, causal
                  inference, prediction sufficiency, data selection
                  mechanisms, invariant statistical models, and a
                  subjective approach to model building.},
  comments =	 {Shows that much of statistics is building
                  conditional independence models.},
}

@article{studeny-89,
  author =	 {M Studeny},
  title =	 {Multiinformation and the problem of characterization
                  of conditional independence relations},
  journal =	 {Probl. Control Inf. Thy.},
  volume =	 18,
  year =	 1989,
  pages =	 {3--16},
  abstract =	 {Certain algebraic relation between multiinformation
                  and conditional mutual information is
                  established. It is shown to be applicable to the
                  problem of characterization of conditional
                  independence relations (= structures) arising in
                  connection with probabilistic expert systems. More
                  concretely, a new property (axiom) of these
                  relations is derived. Some auxiliary results have
                  their own significance: the characterization of
                  marginally continuous measures and the
                  information-theoretical characterization of the
                  conditional product of measures.},
  entered_on =	 {07/21/04},
}
@article{hartmanis-59,
  journal =	 {Information and Control},
  volume =	 2,
  pages =	 {199--213},
  year =	 1959,
  title =	 {The application of some basic inequalities for
                  entropy},
  comments =	 {The author considers joint probability
                  distributions of consecutive symbols in a sequence
                  and studies which constraints are put on the
                  distribution of N symbols, if the distribution of
                  N-1 is known. This is done for Markovian sequences
                  of various depth.},
  author =	 {Juris Hartmanis},
}

@article{lewis-59,
  journal =	 {Information and Control},
  volume =	 2,
  pages =	 {214--225},
  year =	 1959,
  title =	 {Approximating probability distributions to reduce
                  storage requirements},
  author =	 {P M Lewis},
  comments =	 {Suggests to use maxent consistent with marginals to
                  approximate the joint. Then looks at product
                  approximations, so that the approximation to the
                  joint is a product of marginals and conditionals of
                  this joint, and still satisfies marginal
                  constraints. The paper then selects maxent among
                  these products -- this avoids using I-projections
                  to find the true maxent.},
}

@article{brown-59,
  journal =	 {Information and Control},
  volume =	 2,
  pages =	 {386--392},
  year =	 1959,
  title =	 {A note on approximations to discrete probability
                  distributions},
  author =	 {D T Brown},
  comments =	 {This paper introduces IPFP (I-projection)
                  independently of other papers that did it earlier.},
}

@article{deming-stephan-40,
  title =	 {On a least squares adjustment of a sampled frequency
                  table when the expected marginal totals are known},
  author =	 {WE Deming and FS Stephan},
  journal =	 {Ann. Math. Stat.},
  volume =	 11,
  number =	 4,
  year =	 1940,
  pages =	 {427--444},
  entered_on =	 {04/29/04},
  pdf =		 {dependence/deming-stephan-40.pdf},
  comments =	 {Iterative I-projection algorithm first introduced;
                  formulation is not information-theoretic.},
}

@article{roy-kastenbaum-56,
  title =	 {On the hypothesis of no ``interaction'' in a
                  multi-way contingency table},
  author =	 {SN Roy and MA Kastenbaum},
  journal =	 {Ann. Math. Stat.},
  volume =	 27,
  number =	 3,
  year =	 1956,
  pages =	 {749--757},
  pdf =		 {dependence/roy-kastenbaum-56.pdf},
  entered_on =	 {04/29/04},
  comments =	 {See also \cite{darroch-62}. The paper does not give
                  a definition of independence, but immediately starts
                  evaluating independence from data, confounding the
                  absence of definition and the sampling issues.},
}

@article{mosteller-68,
  title =	 {Association and estimation in contingency tables},
  author =	 {F Mosteller},
  journal =	 {J. Amer. Stat. Assoc.},
  volume =	 63,
  number =	 321,
  year =	 1968,
  pages =	 {1--28},
  entered_on =	 {04/29/04},
  pdf =		 {dependence/mosteller-68.pdf},
  comments =	 {Review on using chi^2 of expected occurrences vs
                  actual occurrences to distinguish
                  interactions. Review of the iterative algorithm to
                  fit data to marginals.},
}

@article{darroch-62,
  title =	 {Interactions in multi-factor contingency tables},
  author =	 {JN Darroch},
  journal =	 {J. Roy. Stat. Soc. Ser. B (Methodol.)},
  volume =	 24,
  number =	 1,
  year =	 1962,
  pages =	 {251--263},
  pdf =		 {dependence/darroch-62.pdf},
  entered_on =	 {04/29/04},
  comments =	 {Defining higher order interactions in a
                  non-information-theoretic way (using analysis of
                  variances). This paper is reviewed in
                  \cite{ku-kullback-68}. Noticed a problem of fitting
                  many marginals (i.e., three pairwise marginals for 3
                  variables).},
}
@article{ku-kullback-68,
  journal =	 {J. Res. Natl. Bur. Stand. (Math. Sci)},
  volume =	 {72B},
  number =	 3,
  year =	 1968,
  pages =	 {159--200},
  author =	 {HH Ku and S Kullback},
  title =	 {Interaction in multidimensional contingency tables:
                  an information theoretic approach},
  abstract =	 {The problem of interactions in multidimensional
                  contingency tables is investigated from the
                  viewpoint of information theory as developed by
                  Kullback. The hypothesis of no rth-order interaction
                  is defined in the sense of a hypothesis of
                  "generalized" independence of classifications with
                  fixed rth order marginal restraints. For a three-way
                  table, with given cell probabilities \pi_ijk, the
                  minimum discrimination information for a contingency
                  table with marginals p_ij, p_jk, and p_ik is given
                  by the set of cell probabilities p^*_ijk = a_ij b_jk
                  c_ik \pi_ijk, where a_ij, b_jk, and c_ik are
                  functions of the given marginal probabilities, that
                  is, ln (p^*_ijk/\pi_ijk) = ln a_ij + ln b_jk + ln
                  c_ik, representing no second-order interaction. The
                  minimum discrimination statistic, asymptotically
                  distributed as \chi^2 with appropriate degrees of
                  freedom is 2 \sum x_ijk ln x_ijk - 2 \sum x_ijk ln
                  x^*_ijk \ge 0, where x_ijk are the observed cell
                  frequencies and x^*_ijk are the "no interaction"
                  cell frequencies uniquely determined by a simple
                  convergent iteration process of the marginals on
                  \pi_ijk. For lower order marginal restraints the
                  usual dependence hypotheses are generated when
                  \pi_ijk are taken to be the cell probabilities under
                  uniform distribution. It is shown that the set
                  p^*_ijk satisfies definitions of no second order
                  interaction in a 2x2x2 table given by Bartlett and
                  no interactions in a r*s*t table by Roy and
                  Kastenbaum, and is also related to that given by
                  Good. Results of application to the analysis of some
                  "classic" three-dimensional contingency tables are
                  given, together with full details for two
                  four-dimensional examples.},
  comments =	 {References to many types of definitions of
                  statistical dependence; very good literature
                  review. In particular \cite{darroch-62} and
                  \cite{roy-kastenbaum-56}. The authors again focus on
                  approximating a distribution by another one with
                  fixed marginals, with marginals not necessarily
                  agreeing with the distribution being
                  approximated. With the distribution being
                  approximated taken as a uniform distribution, we
                  recover analysis of \cite{schneidman-03}. This is
                  section 2.3 of the paper. Eq.2.22, or 3.7 is the
                  maxent formulation of the problem. End of page 172,
                  and page 177 (discussion of mixed interactions) --
                  incomplete set of marginals given -- conditional
                  dependence being analyzed, close to
                  \cite{nemenman-04a}. They do not realize difficulty
                  of inferring entropy from data. They do not define
                  interactions between particular variables, and quite
                  often do not separate "connected" vs disconnected
                  interactions. Great paper!},
  entered_on =	 {04/29/04},
  pdf =		 {dependence/ku-kullback-68.pdf},
}
@article{garner-mcgill-56,
  journal =	 {Psychometrika},
  volume =	 21,
  number =	 3,
  year =	 1956,
  pages =	 {219--228},
  title =	 {The relation between information and variance
                  analysis},
  author =	 {WR Garner and WJ McGill},
  entered_on =	 {04/29/04},
  abstract =	 {Analysis of variance and uncertainty analysis are
                  analogous techniques for partitioning
                  variability. In both analyses negative interaction
                  terms due to negative covariance terms that appear
                  when non-orthogonal predictor variables are allowed
                  may occur. Uncertainties can be estimated directly
                  from variances if the form of the distribution is
                  assumed. The decision as to which of the techniques
                  to use depends partly on the properties of the
                  criterion variable. Only uncertainty analysis may be
                  used with a non-metric criterion. Since
                  uncertainties are dimensionless (using no metric),
                  however, uncertainty analysis has a generality which
                  may make it useful even when variances can be
                  computed.},
  pdf =		 {dependence/garner-mcgill-56.pdf},
  comments =	 {Another inclusion/exclusion paper. Relationship of
                  variances to various entropies is discussed.},
}
@article{lancaster-51,
  title =	 {Complex contingency tables treated by the partition
                  of {$\chi^2$}},
  author =	 {HO Lancaster},
  journal =	 {J. Roy. Stat. Soc. Ser. B (Methodol.)},
  volume =	 13,
  number =	 2,
  year =	 1951,
  pages =	 {242--249},
  entered_on =	 {04/27/04},
  pdf =		 {dependence/lancaster-51.pdf},
  abstract =	 {The partition of chi^2 can be used to investigate
                  the interactions of all orders in the higher
                  contingency tables. The chi^2, obtained by
                  partition, in the case of 2x2x2 table for the second
                  order interaction is asymptotically equal to that
                  obtained by Bartlett (1935). Difficulties arise
                  when an attempt is made to find an exact
                  solution. The method of obtaining matrices for
                  orthogonal transformations of variables arranged in
                  hierarchical order is explained. The orthogonal
                  transformation in the case of 2x2x2 table is given
                  in full.},
  comments =	 {Contingency tables (that is, discrete data). The
                  dependence is measured using chi^2, which usually
                  assumes normality assumptions, etc. Using chi^2 he
                  partitions the deviations of the counts from the
                  product of marginals into contributions coming from
                  effects of different orders. I don't see him making
                  a distinction between AND and XOR type gates with
                  this analysis (though he probably should), and if he
                  will be able to uncover "induced" interactions, like
                  between the beginning and the end of a regulatory
                  cascade.},
}

@article{lewis-62,
  title =	 {On the analysis of interaction in multi-dimensional
                  contingency tables},
  author =	 {BN Lewis},
  journal =	 {J. Roy. Stat. Soc. Ser. A (General)},
  volume =	 125,
  number =	 1,
  year =	 1962,
  pages =	 {88--117},
  pdf =		 {dependence/lewis-62.pdf},
  entered_on =	 {04/29/04},
  abstract =	 {The paper is mainly concerned to meet the needs of
                  researchers who are faced with the practical problem
                  of interpreting multi-way contingency tables and are
                  unable to obtain guidance from standard text
                  books. It presents a general review of the more
                  important methods of analysis, together with a
                  selection of procedures which are computationally
                  the simplest available, and which may be adapted
                  for use with different sampling schemes and/or with
                  theoretical rather than estimated parameters. Since
                  there is no limit to the number of hypotheses that
                  might be proposed in respect of multiway tables,
                  there can clearly be no claim of completeness, but
                  it is believed that there are no major omissions.},
  comments =	 {Good review. Mutual independence tests, homogeneity
                  of proportions, partial independence (one
                  independent of the rest), marginal independence (one
                  independent of one other, the rest integrated out),
                  conditional independence.},
}

@article{fienberg-70,
  author =	 {S Fienberg},
  title =	 {An iterative procedure for estimation of contingency
                  tables},
  journal =	 {Ann. Math. Stat.},
  volume =	 41,
  number =	 3,
  year =	 1970,
  pages =	 {907--917},
  entered_on =	 {04/27/04},
  pdf =		 {dependence/fienberg-70.pdf},
  abstract =	 {Deming and Stephan (1940) first proposed the use of
                  an iterative proportional fitting procedure to
                  estimate cell probabilities in a contingency table
                  subject to certain marginal constraints. In this
                  paper we first relate this procedure to a variety of
                  sources and a variety of statistical problems. We
                  then describe the procedure geometrically for
                  two-way contingency tables using the concepts
                  presented in Fienberg (1968). This geometrical
                  description leads to a rather simple proof of
                  convergence of the iterative procedure. We conclude
                  the paper with a discussion of extensions to
                  multi-dimensional tables and to tables with some
                  zero entries.},
  comments =	 {The IPFP algorithm (see also \cite{csiszar-75}) is
                  studied. History of the algorithm is shown with
                  relevant literature connections (a lot of
                  them). Convergence proofs are given. Reference to
                  \cite{mosteller-68} on using IPFP to preserve
                  interaction terms.},
}
@article{ireland-kullback-68,
  title =	 {Contingency tables with given marginals},
  author =	 {CT Ireland and S Kullback},
  journal =	 {Biometrika},
  volume =	 55,
  number =	 1,
  year =	 1968,
  pages =	 {179--188},
  pdf =		 {dependence/ireland-kullback-68.pdf},
  entered_on =	 {04/29/04},
  abstract =	 {In the simplest formulation the problem considered
                  is to estimate the cell probabilities p_ij of an r*c
                  contingency table for which the marginal
                  probabilities p_i and p_j are known and fixed, so as
                  to minimize Dkl(p||\pi), where \pi are the
                  corresponding entries in a given contingency
                  table. An iterative procedure is given for
                  determining the estimates and it is shown that the
                  estimates are BAN, and that the iterative procedure
                  is convergent. A summary of results for a four-way
                  contingency table is given. An illustrative example
                  is given.},
  comments =	 {See \cite{kullback-68}. This work is relevant to the
                  entropy based smoothing for learning PDF's I studied
                  in my thesis. Here it's solved only numerically and
                  in a non-Bayesian context. This is probably the
                  first paper in which the general I-projection
                  convergence algorithm is discussed, and its
                  convergence is proven.},
}

@article{kullback-68,
  title =	 {Probability densities with given marginals},
  author =	 {S Kullback},
  journal =	 {Ann. Math. Stat.},
  volume =	 39,
  number =	 4,
  year =	 1968,
  pages =	 {1236--1243},
  pdf =		 {dependence/kullback-68.pdf},
  entered_on =	 {04/28/04},
  comments =	 {Eq. 1.1, it looks like the order of distributions in
                  the KL divergence is inverted from what is
                  "natural". On the other hand, such order guarantees
                  that this divergence never blows up. The problem
                  being solved is to find the best approximation to
                  one density with another density, whose marginals
                  are fixed. A special case of this would be the case,
                  where the marginals agree with the distribution
                  being approximated (that is, what I work on). The
                  iterative algorithm (see also \cite{csiszar-75}) for
                  finding such distributions is suggested. The cases
                  considered involve only single variable marginals.},
}
@article{han-78,
  title =	 {Nonnegative entropy measures of multivariate
                  symmetric dependence},
  author =	 {TS Han},
  journal =	 {Information and Control},
  volume =	 36,
  pages =	 {133--156},
  year =	 1978,
  entered_on =	 {04/27/04},
  abstract =	 {A study of nonnegativity "in general" in the
                  symmetric (correlative) entropy space as well as
                  discussions of some related problems is
                  presented. The main result is summarized as
                  Theorems 4.1 and 5.3, which give the necessary and
                  sufficient condition for an element of the symmetric
                  (correlative) entropy space to be nonnegative. In
                  particular, Theorem 4.1 may be regarded as
                  establishing a mathematical foundation for
                  information-theoretic analysis of multivariate
                  symmetric correlations. On the basis of these
                  results, we propose a "hierarchical structure" of
                  probabilistic dependence relations where it is shown
                  that any symmetric correlation associated with a
                  nonnegative entropy is decomposed into pairwise
                  conditional and/or nonconditional correlations. A
                  systematic duality existing in the set of
                  nonnegative entropies is also considerably
                  clarified.},
  comments =	 {These entropy measures are related to
                  \cite{studeny-vejnarova-01}. They also appear as
                  linear combinations of entropies, and, therefore,
                  are not interpretable in terms of typicality
                  theorems. They characterize (in some way)
                  conditional independence. The quantity $\Delta e_i$
                  measures total independence between i variables,
                  while $\Delta c_i$ measures conditional independence
                  between the same. $S$ and $D$ are some cumulative
                  measures of correlations, from the lowest order to
                  order $i$. Again, the same fractional problem still
                  applies.},
  pdf =		 {dependence/han-78.pdf},
}


@article{kullback-71,
  journal =	 {Ann. Math. Stat.},
  volume =	 42,
  number =	 2,
  year =	 1971,
  pages =	 {594--606},
  author =	 {S Kullback},
  title =	 {Marginal homogeneity of multidimensional contingency
                  tables},
  pdf =		 {dependence/kullback-71.pdf},
  entered_on =	 {02/27/04},
  abstract =	 {Tests of marginal homogeneity in a two-way
                  contingency table given by [1], [3], [13] do not
                  seem to lend themselves easily to extensions to the
                  problem of m-way marginal homogeneity in a N-way
                  r*r*...*r contingency table, m<N. The principle of
                  minimum discrimination information estimation and
                  the associated minimum discrimination information
                  statistic applied in [5] to the problem of marginal
                  homogeneity in a r*r contingency table can be easily
                  extended to the case of a multidimensional
                  contingency table. Estimates of the cell entries
                  under the hypotheses of m-way marginal homogeneity
                  are given. Relationships among the tests of
                  homogeneity for m-way, m=1,2,...,N-1, marginals are
                  given by an analysis of information. Numerical
                  results are given for two sample 3x3x3 tables and
                  two 5x5 tables.},
  comments =	 {Strange, but following \cite{ireland-kullback-68},
                  the KL divergence seems to have the original
                  distribution and its projection in reverse order!
                  Homogeneity means that all of the marginal pdf's are
                  the same! Not very relevant to my work. Proves that
                  multiinfo with higher order statistics >= the one
                  with lower order ones, and other factorizations
                  (theorems 4.1, 4.2).},
}

@article{linden-etal-02,
  journal =	 {Phys. Rev. Lett.},
  volume =	 89,
  pages =	 207901,
  year =	 2002,
  title =	 {Almost Every Pure State of Three Qubits Is
                  Completely Determined by Its Two-Particle Reduced
                  Density Matrices},
  author =	 {N Linden and S Popescu and WK Wootters},
  pdf =		 {dependence/linden-etal-02.pdf},
  entered_on =	 {02/26/04},
  abstract =	 {In a system of n quantum particles, we define a
                  measure of the degree of irreducible n-way
                  correlation, by which we mean the correlation that
                  cannot be accounted for by looking at the states of
                  n-1 particles. In the case of almost all pure states
                  of three qubits, we show that there is no such
                  correlation: almost every pure state of three qubits
                  is completely determined by its two-particle reduced
                  density matrices.},
  comments =	 {Development of a log-linear model for quantum
                  systems. That is, instead of binary variables the
                  degrees of freedom are qubits. Need to understand
                  better, but I think that the reason that most of the
                  states are reducible to two-particle specified ones
                  is that one can freely apply unitary matrices to
                  transform states into each other. Even more
                  important reason is probably that for binary
                  variables (qubits) one can use log-linear model
                  expansion of the pdf to get that of 8 parameters
                  that define the log-linear model only one of them
                  means 3-way interactions (the one that couples all
                  three). In this sense, all other 7 parameters
                  correspond to no 3-way interactions.},
}
@article{csiszar-75,
  title =	 {I-divergence geometry of probability distributions
                  and minimization problems},
  journal =	 {Ann. Probab.},
  volume =	 3,
  number =	 1,
  author =	 {I Csiszar},
  year =	 1975,
  pages =	 {146--158},
  pdf =		 {dependence/csiszar-75.pdf},
  entered_on =	 {04/27/04},
  abstract =	 {Some geometric properties of PD's are established,
                  Kullback's I-divergence playing the role of squared
                  Euclidean distance. The minimum discrimination
                  information problem is viewed as that of projecting
                  a PD onto a convex set of PD's and useful existence
                  theorems for and characterizations of the minimizing
                  PD are arrived at. A natural generalization of known
                  iterative algorithm converging to the minimizing PD
                  in special situations is given; even for those
                  special cases, our convergence proof is more
                  generally valid than those previously published. As
                  corollaries of independent interest,
                  generalizations of known results on the existence of
                  PD's or nonnegative matrices of a certain form are
                  obtained. The Lagrange multiplier technique is not
                  used.},
  comments =	 {Definition of the I-projection, relevant for
                  \cite{nemenman-tishby-04,friedman-etal-01}, and a
                  bunch of results about this projection are
                  proven. Relation between I projections and maxent is
                  discussed. Algorithm for finding I projections is
                  given (IPFP -- iterative proportional fitting
                  procedure). Has all the mathematical details of the
                  concepts involved. Very good and detailed article.},
}

@article{good-63,
  title =	 {Maximum entropy for hypothesis formulation,
                  especially for multidimensional contingency tables},
  journal =	 {Ann. Math. Stat.},
  volume =	 34,
  number =	 3,
  year =	 1963,
  pages =	 {911--934},
  pdf =		 {dependence/good-63.pdf},
  entered_on =	 {04/27/04},
  author =	 {IJ Good},
  abstract =	 {The principle of maximum entropy, together with some
                  generalizations, is interpreted as a heuristic
                  principle for the generation of null hypotheses. The
                  main application is to m-dimensional population
                  contingency tables, with the marginal totals given
                  down to dimension m-r ("restraints of the r'th
                  order"). The principle then leads to the null
                  hypothesis of no "r-th order interactions."
                  Significance tests are given for testing the
                  hypothesis of no s'th order or higher order
                  interactions, some cases of which have been treated
                  by Bartlett and by Roy and Kastenbaum. It is shown
                  that, if a complete set of r-th order restraints are
                  given, then the hypothesis of the vanishing of all
                  r'th-order and higher-order interactions leads to a
                  unique set of cell probabilities, if the restraints
                  are consistent, but not only just consistent. This
                  confirms and generalizes a recent conjecture due to
                  Darroch. A kind of duality between maximum entropy
                  and maximum likelihood is proved. Some relationships
                  between maximum entropy, interactions, and Markov
                  chains are proved.},
  comments =	 {Maxent form of the pdf consistent with many
                  different dimensional marginals. Examples considered
                  usually (not always) apply to binary variables, but
                  always to variables of the same cardinality and for
                  discrete variables. References to \cite{brown-59} as
                  the first example of the iterative algorithm to
                  finding maxent distributions consistent with the
                  marginals. The definition of the interactions
                  (Fourier interaction, eq 7) is somewhat strange. For
                  variables with cardinality 1, nonzero fourier
                  coefficient of the logPDF for the variable means
                  dependence on it, and nonzero fourier coeff for,
                  say, variable product means dependence on it,
                  etc. So measuring existence of the n-th order
                  interaction is then equivalent to seeing if the
                  appropriate fourier coeff. is zero. It is difficult
                  to extend this to variables with higher cardinality
                  and continuous variables, as is done in
                  \cite{schneidman-etal-03,nemenman-04a}.},
}

@book{kullback-59,
  title =	 {Information theory and statistics},
  author =	 {S Kullback},
  publisher =	 {Wiley},
  address =	 {New York},
  year =	 1959,
  entered_on =	 {04/27/04},
  comments =	 {Supersedes all of Kullback's earlier articles on the
                  basics of Information theory and its relations to
                  statistics.},
}

@article{mcgill-54,
  author = {W McGill},
  title = {Multivariate information transmission},
  journal = {IRE Trans. Inf. Thy.},
  volume = 4,
  pages = {93--110},
  year = 1954,
  entered_on = {04/27/04},
  pdf = {dependence/mcgill-54.pdf},
  abstract = {A multivariate analysis based on transmitted
                  information is presented. It is shown that sample
                  transmitted information provides a simple method for
                  measuring and testing association in
                  multidimensional contingency tables. Relations with
                  analysis of variance are pointed out and statistical
                  tests are described.},
  comments = {For variables A,B,C, the paper compares I(A,B) with
                  I(A,B|C), and all other permutations of
                  variables. The difference between these two
                  quantities (which is the synergy) is supposed to
                  measure the 3-way interactions between the
                  variables. "We see that [synergy] is the gain (or
                  loss) in sample information transmitted between any
                  two of the variables, due to additional knowledge of
                  the third variable." Section 7 discusses the
                  difference between conditional and marginal
                  independence. Section VIII introduces
                  multiinformation. End of the paper discusses bias in
                  estimating information quantities from data, and how
                  one would measure complete independence from data.},
}

@article{mcgill-54a,
  author     = {McGill, W},
  title      = {Multivariate information transmission},
  journal    = {Psychometrika},
  volume     = {19},
  pages      = {97--116},
  year       = {1954},
  entered_on = {04/29/04},
  comments   = {This paper repeats \cite{mcgill-54}.},
}

@book{bishop-etal-75,
  author     = {Bishop, Y and Fienberg, S and Holland, P},
  title      = {Discrete multivariate analysis: theory and practice},
  publisher  = {MIT Press},
  address    = {Cambridge, MA},
  year       = {1975},
  entered_on = {04/25/04},
  comments   = {Standard text book. Included description of log-linear
                models. See log-linear models of uniform order for a
                connection to connected multiinformation.},
}


@article{watanabe-60,
  title = {Information Theoretical Analysis of Multivariate
                  Correlation},
  author = {S Watanabe},
  abstract = {A set $\lambda$ of stochastic variables, $y_1, y_2,
                  \ldots, y_n$, is grouped into subsets, $\mu_1,
                  \mu_2, \ldots, \mu_k$. The correlation existing in
                  $\lambda$ with respect to the $\mu$'s is adequately
                  expressed by $C = \sum_{i=1}^{k} S(\mu_i) -
                  S(\lambda) \ge 0$, where $S(\nu)$ is the entropy
                  function defined with reference to the variables $y$
                  in subset $\nu$. For a given $\lambda$, $C$ becomes
                  maximum when each $\mu_i$ consists of only one
                  variable, ($n=k$). The value $C$ is then called the
                  total correlation in $\lambda$,
                  $C_{tot}(\lambda)$. The present paper gives various
                  theorems, according to which $C_{tot}(\lambda)$ can
                  be decomposed in terms of the partial correlations
                  existing in subsets of $\lambda$, and of quantities
                  derivable therefrom. The information-theoretical
                  meaning of each decomposition is carefully
                  explained. As illustrations, two problems are
                  discussed at the end of the paper: (1) redundancy in
                  geometrical figures in pattern recognition, and (2)
                  randomization effect of shuffling cards marked
                  "zero" or "one."},
  entered_on = {04/25/04},
  pdf = {dependence/watanabe-60.pdf},
  volume = 4,
  number = 1,
  journal = {IBM J. of Research and Development},
  year = 1960,
  pages = {66--82},
  comments = {Multiinformation (total correlation) between
                  collection of variables is considered. Possible way
                  to axiomatize multiinfo -- theorem on page 5, that
                  is -- branching variables into groups produces the
                  same multiinfo independent of the order of
                  branching. Synergies are also considered, but termed
                  without "profound meaning", though "beautiful". The
                  information "at a given level" is given in terms of
                  the average multiinfo within all pairs, triplets,
                  etc. These are not "connected", but accumulated
                  quantities. The paper also considers what in the
                  language of \cite{bnt-01} is $NS(1) - S(N)$ for a
                  few cases (Markovian, etc.). He also studies $-S(N)
                  +2 S(N-1) -S(N-2) \approx d^2 S(N) /dN^2$ for large
                  $N$. This is the derivative of the universal
                  learning curve in \cite{bnt-01}. We should have
                  cited this paper in \cite{bnt-01}.},
}
@article{schneidman-etal-03,
  journal = {Phys. Rev. Lett.},
  volume = 91,
  pages = {238701},
  year = 2003,
  title = {Network Information and Connected Correlations},
  author = {E Schneidman and S Still and MJ Berry and W Bialek},
  abstract = {Entropy and information provide natural measures of
                  correlation among elements in a network. We
                  construct here the information theoretic analog of
                  connected correlation functions: irreducible N-point
                  correlation is measured by a decrease in entropy for
                  the joint distribution of N variables relative to
                  the maximum entropy allowed by all the observed
                  $N-1$ variable distributions. We calculate the
                  "connected information" terms for several examples
                  and show that it also enables the decomposition of
                  the information that is carried by a population of
                  elements about an outside source.},
  pdf = {dependence/schneidman-etal-03.pdf},
  comments = {Good bibliography. The paper clearly introduces
                  connected correlations by means of maximum entropy
                  approximations and analyzes a few simple examples.},
}

@techreport{bell-02,
  author = {AJ Bell},
  year = 2002,
  title = {Co-Information Lattice},
  url = {http://www.rni.org/pubs.html},
  institution = {Redwood Neuroscience Institute},
  number = {RNI-TR-02-1},
  entered_on = {04/25/04},
  pdf = {dependence/bell-02.pdf},
  abstract = {In 1955, McGill published a multivariate
                  generalisation of Shannon's mutual
                  information. Algorithms such as Independent
                  Component Analysis use a different generalisation,
                  the redundancy, or multi-information [13]. McGill's
                  concept expresses the information shared by all of K
                  random variables, while the multi-information
                  expresses the information shared by any two or more
                  of them. Partly to avoid confusion with the
                  multi-information, I call his concept here the
                  co-information. Co-informations, oddly, can be
                  negative. They form a partially ordered set, or
                  lattice, as do the entropies. Entropies and
                  co-informations are simply and symmetrically related
                  by Moebius inversion [12]. The co-information
                  lattice sheds light on the problem of approximating
                  a joint density with a set of marginal densities,
                  though as usual we run into the partition
                  function. Since the marginals correspond to
                  higher-order edges in Bayesian hypergraphs, this
                  approach motivates new algorithms such as Dependent
                  Component Analysis, which we describe, and (loopy)
                  Generalised Belief Propagation on hypergraphs, which
                  we do not. Simulations of subspace-ICA (a tractable
                  DCA) on natural images are presented on the web. In
                  neural computation theory, we identify the
                  co-information of a group of neurons (possibly in
                  space/time staggered patterns) with the "degree of
                  existence" of a corresponding cell assembly.},
  comments = {The synergy/redundancy is described here, termed
                  co-information.},
}

@InCollection{studeny-vejnarova-98,
  author = {M Studen{\'y} and J Vejnarov{\'a}},
  title = {The multiinformation function as a tool for
                  measuring stochastic dependence},
  booktitle = {Learning in Graphical Models},
  publisher = {Kluwer},
  year = {1998},
  editor = {MI Jordan},
  address = {Dordrecht},
  pages = {261--298},
  postscript = {dependence/studeny-vejnarova-98.ps.gz},
  entered_on = {04/25/04},
  abstract = {Given a collection of random variables X(i) where i
                  belongs to a finite nonempty set N, the
                  corresponding multiinformation function ascribes (to
                  every subset A of N) the relative entropy of the
                  joint distribution of X(A) (that is, the random
                  vector composed of the variables X(i) where i
                  belongs to A) with respect to the product of
                  distributions of individual random variables X(i)
                  through i in A. We argue that it is a useful tool
                  for solving the problems concerning stochastic
                  (conditional) dependence and independence (at least
                  in discrete case). First, it makes possible to
                  express the conditional mutual information between
                  X(A) and X(B) given X(C) (for every disjoint subsets
                  A,B,C of N) which can be considered as a good
                  measure of conditional stochastic dependence. Second,
                  one can introduce reasonable measures of dependence
                  of level r among variables of X(A) (where A is a
                  subset of N and 0 < r < card A) which are
                  expressible by means of the multiinformation
                  function. Third, it enables one to derive
                  theoretical results on (nonexistence of an)
                  axiomatic characterization of stochastic conditional
                  independence models.},
  comments = {Axiomatics for conditional dependence through
                  multiinformation and its derivatives. On page 10
                  they discuss building a distribution consistent with
                  two pairwise marginals AC, BC and checking what the
                  conditional mutual info I(AB|C) may be. This is
                  easy and easily bound in terms of the cascade
                  Bayesian model. The problem would appear if all 3
                  marginals are known, but they do not discuss
                  this. They discuss the problem of isolating
                  dependencies at different levels (again, in terms of
                  linear combinations of mutual informations); the
                  function they produce ($\Delta$) is neither
                  connected multiinfo, nor the synergy/redundancy. It
                  is always nonnegative, which is good for a
                  definition of interactions. However, for example,
                  for a 3-node network, their definition of the
                  triplet info is: $\Delta = I(ABC) -
                  2[I(AB)+I(AC)+I(BC)]$. For a pdf $P(ABC) =
                  P(AB)*P(C)$ this reduces to $\Delta = I(AB)$. This
                  distribution clearly has no 3-way dependencies, but
                  does have them according to the authors'
                  definition. Further, in context of coding,
                  fractional powers of probabilities are problematic,
                  as they will not arrive at the typicality
                  theorems. Thus $\Delta$ is not a good quantity. Prop
                  5.1 (axiomatics) is probably irrelevant to my
                  work.},
}


@inproceedings{goodman-johnson-04,
  title = {Orthogonal decomposition of multivariate statistical
                  dependence measures},
  author = {I Goodman and D Johnson},
  abstract = {We describe two multivariate statistical dependence
                  measures which can be orthogonally decomposed to
                  separate the effects of pairwise, triplewise, and
                  higher order interactions between the random
                  variables. These decompositions provide a convenient
                  method of analyzing statistical dependencies between
                  large groups of random variables, within which
                  smaller "sub-groups" may exhibit dependencies
                  separately from the rest of the variables. The first
                  dependence measure is a generalization of Pearson's
                  $\phi^2$, and we decompose it using an orthonormal
                  series expansion of joint probability density
                  functions. The second measure is based on
                  Kullback-Leibler distance, and we decompose it using
                  information geometry. Applications of these
                  techniques include analysis of neural population
                  recordings and multimodal sensor fusion. We discuss
                  in detail the simple example of three jointly
                  defined binary random variables.},
  pdf = {dependence/goodman-johnson-04.pdf},
  entered_on = {04/25/04},
  booktitle = {2004 Int. Conf. Acoustics, Speech, and Signal
                  Processing (ICASSP)},
  publisher = {IEEE},
  comments = {See \cite{goodman-johnson-03,amari-01}.},
  year = 2004,
}


@article{matsuda-00,
  journal = {Phys. Rev. E},
  volume = 62,
  pages = {3096--3102},
  year = 2000,
  title = {Physical nature of higher-order mutual information:
                  Intrinsic correlations and frustration},
  author = {Hiroyuki Matsuda},
  abstract = {This paper studies some properties and implications
                  of higher-order mutual information functions, which
                  should serve for the analysis of general complex
                  systems. We note that the higher-order mutual
                  information can either be positive or negative
                  depending on the correlation among ensembles. Two
                  opposite types of correlations are discussed in
                  connection with the concept of frustration. Simple
                  examples are presented to demonstrate that our
                  concepts are especially helpful in understanding the
                  nature of correlations in frustrated systems. The
                  higher-order mutual information provides an
                  appropriate measure of the frustration effect.},
  entered_on = {04/25/04},
  pdf = {dependence/matsuda-00.pdf},
  comments = {"Higher order mutual info" (with the given reference
                  to Cerf et al.) are just exclusion/inclusion
                  synergies. Not very interesting. Expansion of
                  multiinfo in terms of redundancies is given. This is
                  relevant for \cite{nemenman-tishby-04}. Nice
                  interpretation of synergy (negative redundancy) in
                  terms of frustration in spin systems (section III).},
}

@article{schlather-tawn-03,
  journal = {Biometrika},
  year = 2003,
  volume = 90,
  pages = {139--156},
  title = {A dependence measure for multivariate and spatial
                  extreme values: {P}roperties and inference},
  author = {Martin Schlather and Jonathan Tawn},
  abstract = {We present properties of a dependence measure that
                  arises in the study of extreme values in
                  multivariate and spatial problems. For multivariate
                  problems the dependence measure characterises
                  dependence at the bivariate level, for all pairs and
                  all higher orders up to and including the dimension
                  of the variable. Necessary and sufficient conditions
                  are given for subsets of dependence measures to be
                  self-consistent, that is to guarantee the existence
                  of a distribution with such a subset of values for
                  the dependence measure. For pairwise dependence,
                  these conditions are given in terms of positive
                  semidefinite matrices and non-differentiable,
                  positive definite functions. We construct new
                  nonparametric estimators for the dependence measure
                  which, unlike all naive nonparametric estimators,
                  impose these self-consistency properties. As the new
                  estimators provide an improvement on the naive
                  methods, both in terms of the inferential and
                  interpretability properties, their use in
                  exploratory extreme value analyses should aid the
                  identification of appropriate dependence models. The
                  methods are illustrated through an analysis of
                  simulated multivariate data, which shows that a lack
                  of self-consistency is frequently a problem with
                  the existing estimators, and by a spatial analysis
                  of daily rainfall extremes in south-west England,
                  which finds a smooth decay in extremal dependence
                  with distance.},
  pdf = {dependence/schlather-tawn-03.pdf},
  entered_on = {04/22/04},
  comments = {Some concept of hierarchical dependence; not really
                  relevant for me.},
}

@inproceedings{abdous-etal-04,
  author     = {Abdous, B and Berred, A and Khoudraji, A},
  title      = {Multivariate tail dependence measures},
  booktitle  = {Int. Conf. on Distribution Theory, Order Statistics and
                Inference in Honor of Barry C. Arnold},
  year       = {2004},
  pdf        = {dependence/abdous-etal-04.pdf},
  entered_on = {04/12/04},
  comments   = {Asymptotical (extreme value) dependence between variables
                is analyzed. See also \cite{schlather-tawn-03}.},
}

@misc{hsing-etal-03,
  howpublished = {Preprint},
  year = 2003,
  pdf = {dependence/hsing-etal-03.pdf},
  entered_on = {04/22/04},
  title = {Dependence estimation and visualization in
                  multivariate extremes with applications to financial
                  data},
  author = {T Hsing and C Kl{\"u}ppelberg and G Kuhn},
  abstract = {We investigate extreme dependence in a multivariate
                  setting with special emphasis on financial
                  applications. We introduce a new dependence function
                  which allows us to capture the complete extreme
                  dependence structure and present a nonparametric
                  estimation procedure. The new dependence function is
                  compared with the existing measures including the
                  spectral measure and other devices measuring extreme
                  dependencies. We also apply our method to a
                  financial data set of zero coupon swap rates and
                  estimate extreme dependence in the data.},
  comments = {See also \cite{schlather-tawn-03}.},
}

@inproceedings{goodman-johnson-03,
  author     = {Goodman, IN and Johnson, DH},
  title      = {New multivariate dependence measures and applications to
                neural ensembles},
  booktitle  = {Proc. of the 2003 IEEE Workshop on Statistical Signal
                Processing},
  year       = {2003},
  pdf        = {dependence/goodman-johnson-03.pdf},
  entered_on = {04/22/04},
  abstract   = {We develop two new multivariate statistical dependence
                measures. The first, based on the Kullback-Leibler
                distance, results in a single value that indicates the
                general level of dependence among the random variables.
                The second, based on the orthonormal series expansion of
                joint probability density functions, provides more detail
                about the nature of the dependence. We apply these
                dependence measures to the analysis of simultaneous
                recordings made from multiple neurons, in which
                dependencies are time-varying and potentially information
                bearing.},
  comments   = {The first measure introduced is pretty much the
                multiinformation -- the authors need to read the relevant
                literature. The second measure is similar in spirit to
                \cite{barrett-lampard-55}. In my view, this is not the
                most intuitive way to define dependence, but this is a
                matter of taste.},
}


@article{bezzi-etal-02,
  title = {Redundancy and Synergy Arising from Pairwise
                  Correlations in Neuronal Ensembles},
  author = {M Bezzi and ME Diamond and A Treves},
  abstract = {Multielectrode arrays allow recording of the
                  activity of many single neurons, from which
                  correlations can be calculated. The functional roles
                  of correlations can be revealed by measures of the
                  information conveyed by neuronal activity; a simple
                  formula has been shown to discriminate the
                  information transmitted by individual spikes from
                  the positive or negative contributions due to
                  correlations (Panzeri et al., 1999). Here, this
                  analysis, previously applied to recordings from
                  small ensembles, is developed further by considering
                  a model of a large ensemble, in which correlations
                  among the signal and noise components of neuronal
                  firing are small in absolute value and entirely
                  random in origin. Even such small random
                  correlations are shown to lead to large possible
                  synergy or redundancy, whenever the time window for
                  extracting information from neuronal firing extends
                  to the order of the mean interspike interval. In
                  addition, a sample of recordings from rat barrel
                  cortex illustrates the mean time window at which
                  such corrections dominate when correlations are,
                  as often in the real brain, neither random nor
                  small. The presence of this kind of correlations for
                  a large ensemble of cells restricts further the time
                  of validity of the expansion.},
  journal = {J. Comp. Neurosci.},
  volume = 12,
  pages = {165--174},
  year = 2002,
  entered_on = {04/22/04},
  pdf = {dependence/bezzi-etal-02.pdf},
  comments = {Synergy/redundancy discussion with good literature
                  collection for multielectrode recordings. It's
                  unclear to me why one should study information rate
                  in the limit of the window size $t\to0$, vs
                  $t\to\infty$. Did not read carefully.},
}

@article{lancaster-58,
  author = {HO Lancaster},
  title = {The structure of bivariate distributions},
  journal = {Ann. Math. Stat.},
  volume = 29,
  number = 3,
  year = 1958,
  pages = {719--736},
  entered_on = {04/22/04},
  pdf = {dependence/lancaster-58.pdf},
  comments = {Various measures of dependency between two variables
                  are studied. See also \cite{barrett-lampard-55}.},
}

@article{amari-01,
  journal = {IEEE Trans. Inf. Thy.},
  volume = 47,
  number = 5,
  year = 2001,
  pages = {1701--1711},
  title = {Information Geometry on Hierarchy of Probability
                  Distributions},
  author = {Shun-ichi Amari},
  entered_on = {04/22/04},
  abstract = {An exponential family or mixture family of
                  probability distributions has a natural hierarchical
                  structure. This paper gives an "orthogonal"
                  decomposition of such a system based on information
                  geometry. A typical example is the decomposition of
                  stochastic dependency among a number of random
                  variables. In general, they have a complex structure
                  of dependencies. Pairwise dependency is easily
                  represented by correlation, but it is more difficult
                  to measure effects of pure triplewise or higher
                  order interactions (dependencies) among these
                  variables. Stochastic dependency is decomposed
                  quantitatively into an "orthogonal" sum of pairwise,
                  triplewise, and further higher order
                  dependencies. This gives a new invariant
                  decomposition of joint entropy. This problem is
                  important for extracting intrinsic interactions in
                  firing patterns of an ensemble of neurons and for
                  estimating its functional connections. The
                  orthogonal decomposition is given in a wide class of
                  hierarchical structures including both exponential
                  and mixture families. As an example, we decompose
                  the dependency in a higher order Markov chain into a
                  sum of those in various lower order Markov chains.},
  comments = {This work deals *only* with exponential families of
                  distributions. Good collection of references to
                  "information geometry" and differential geometry
                  formulation of inference, and reasonably
                  understandable review. Note on relation of the
                  cumulant generating function and the entropy as the
                  Legendre transform variables for the exponential
                  family, and how the KL divergence for this family is
                  the amount by which two distributions do not satisfy
                  this Legendre transform (eqs. 22, 23). The relation
                  of this geometric construction to the maxent
                  principle for exponential distributions is described
                  (Theorem 7). The paper has expansion of the
                  multiinformation in terms of the lower order
                  "connected" informations. For general distributions,
                  the author has nothing to say beyond the general
                  maxent construction, but he seems to have beat
                  \cite{schneidman-etal-03}.},
  pdf = {dependence/amari-01.pdf},
}


@article{barrett-lampard-55,
  year = 1955,
  author = {J Barrett and D Lampard},
  title = {An expansion for some second-order probability
                  distributions and its application to noise problems},
  journal = {IRE Trans. Inf. Thy.},
  pages = {10--15},
  volume = 1,
  pdf = {dependence/barrett-lampard-55.pdf},
  entered_on = {04/22/04},
  abstract = {In this paper it is shown that, in general,
                  second-order probability distributions may be
                  expanded in a certain double series involving
                  orthogonal polynomials associated with the
                  corresponding first order probability
                  distributions. Attention is restricted to those
                  second order probability distributions which lead to
                  a "diagonal" form for this expansion.\par When such
                  distributions are joint probability distributions
                  for samples taken from a pair of time series, some
                  interesting results can be demonstrated. For
                  example, it is shown that if one of the time series
                  undergoes an amplitude distortion in a time varying
                  "instantaneous" nonlinear device, the covariance
                  function after distortion is simply proportional to
                  that before distortion.\par Some simple results
                  concerning conditional expectations are given and an
                  extension of a theorem, due to Doob, on stationary
                  Markov processes is presented.\par The relation
                  between the "diagonal" expansion used in this paper
                  and the Mercer expansion of the Kernel of a certain
                  linear homogeneous integral equation, is pointed out
                  and in conclusion explicit expansions are given for
                  three specific examples.},
  comments = {See the abstract.},
}

@book{agresti-90,
  author     = {Agresti, A},
  title      = {Categorical Data Analysis},
  publisher  = {Wiley},
  address    = {New York, NY},
  year       = {1990},
  entered_on = {04/22/04},
  comments   = {According to \cite{amari-01}, this should have a good
                discussion of standard log-linear models for categorical
                variables.},
}
 
@book{joe-97,
  author    = {Joe, Harry},
  title     = {Multivariate models and dependence concepts},
  publisher = {Chapman \& Hall},
  address   = {Boca Raton, FL},
  year      = {1997},
}

% NOTE(review): the pdf path below says chechik-etal-01 while the key is
% chechik-etal-02 -- confirm the file name on disk before changing either.
@inproceedings{chechik-etal-02,
  author     = {Chechik, G and Globerson, A and Anderson, MJ and Young, ED
                and Nelken, I and Tishby, N},
  title      = {Groups redundancy measures reveal redundancy reduction
                along the auditory pathway},
  booktitle  = {Advances in Neural Information Processing Systems 14},
  editor     = {Dietterich, TG and Becker, S and Ghahramani, Z},
  publisher  = {MIT Press},
  address    = {Cambridge, MA},
  year       = {2002},
  pdf        = {neural/chechik-etal-01.pdf},
  entered_on = {04/21/04},
  abstract   = {The way groups of auditory neurons interact to code
                acoustic information is investigated using an
                information theoretic approach. Identifying the case of
                stimulus-conditioned independent neurons, we develop
                redundancy measures that allow enhanced information
                estimation for groups of neurons. These measures are then
                applied to study the collaborative coding efficiency in
                two processing stations in the auditory pathway: the
                inferior colliculus (IC) and the primary auditory cortex
                (A1). Under two different coding paradigms we show
                differences in both information content and group
                redundancies between IC and cortical auditory neurons.
                These results provide for the first time a direct
                evidence for redundancy reduction along the ascending
                auditory pathway, as has been hypothesized by Barlow
                (1959). The redundancy effects under the single-spikes
                coding paradigm are significant only for groups larger
                than ten cells, and cannot be revealed with the standard
                redundancy measures that use only pairs of cells. Our
                results suggest that redundancy reduction transformations
                are not limited to low level sensory processing (aimed to
                reduce redundancy in input statistics) but are applied
                even at cortical sensory stations.},
  comments   = {Define redundancy and synergy in neural code.},
}


@article{martignon-etal-00,
  author =	 {L Martignon and G Deco and K Laskey and M Diamond
                  and W Freiwald and E Vaadia},
  title =	 {Neural Coding: Higher-Order Temporal Patterns in the
                  Neurostatistics of Cell Assemblies},
  journal =	 {Neural Comput.},
  volume =	 12,
  pages =	 {2621--2653},
  year =	 2000,
  pdf =		 {dependence/martignon-etal-00.pdf},
  entered_on =	 {04/21/04},
  abstract =	 {Recent advances in the technology of multiunit
                  recordings make it possible to test Hebb's
                  hypothesis that neurons do not function in isolation
                  but are organized in assemblies. This has created
                  the need for statistical approaches to detecting the
                  presence of spatiotemporal patterns of more than two
                  neurons in neuron spike train data. We mention three
                  possible measures for the presence of higher-order
                  patterns of neural activation---coefficients of
                  log-linear models, connected cumulants, and
                  redundancies---and present arguments in favor of the
                  coefficients of log-linear models. We present test
                  statistics for detecting the presence of
                  higher-order interactions in spike train data by
                  parameterizing these interactions in terms of
                  coefficients of log-linear models. We also present a
                  Bayesian approach for inferring the existence or
                  absence of interactions and estimating their
                  strength. The two methods, the frequentist and the
                  Bayesian one, are shown to be consistent in the
                  sense that interactions that are detected by either
                  method also tend to be detected by the other. A
                  heuristic for the analysis of temporal patterns is
                  also proposed. Finally, a Bayesian test is presented
                  that establishes stochastic differences between
                  recorded segments of data. The methods are applied
                  to experimental data and synthetic data drawn from
                  our statistical models. Our experimental data are
                  drawn from multiunit recordings in the prefrontal
                  cortex of behaving monkeys, the somatosensory cortex
                  of anesthetized rats, and multiunit recordings in
                  the visual cortex of behaving monkeys.},
  comments =	 {For binary variables, maxent distributions are
                  log-linear. This case is analyzed here, and test
                  statistics are suggested to distinguish various
                  types of dependence.},
}

@book{johnson-wichern-02,
  author =	 {R Johnson and D Wichern},
  title =	 {Applied Multivariate Statistical Analysis},
  publisher =	 {Prentice Hall},
  address =	 {Upper Saddle River, NJ},
  year =	 2002,
  edition =	 {Fifth},
  entered_on =	 {04/21/04},
  comments =	 {Standard textbook on the subject.},
}

@article{soofi-92,
  title =	 {A generalized formulation of conditional logit with
                  diagnostics},
  author =	 {E. Soofi},
  journal =	 {J. Amer. Stat. Assoc.},
  volume =	 87,
  number =	 419,
  year =	 1992,
  pages =	 {812--816},
  pdf =		 {dependence/soofi_92.pdf},
  entered_on =	 {05/03/04},
  abstract =	 {The conditional logit model is a multinomial logit
                  model that permits the inclusion of choice-specific
                  attributes. The article shows that the conditional
                  logit model will maximize entropy given a set of
                  attribute-value preserving constraints. A
                  correspondence between the maximum entropy (ME) and
                  maximum likelihood (ML) estimates for logit
                  probabilities is established. Some easily computable
                  and useful diagnostics for logit analysis are
                  provided, and it is shown that an evaluation of the
                  relative importance of attributes can be made using
                  the ME formulation. The ME formulation is also
                  generalized to accommodate initial choice
                  probabilities into the logit model. An example is
                  given.},
  comments =	 {Logit-linear model, maxent treatment. Does not
                  really tell us whether there is a particular
                  interaction; builds up interactions one by one,
                  possibly in wrong order, and sums up multiinfo as
                  in \cite{schneidman-etal-03}. Cannot tell if a
                  particular interaction is to be kept, cannot return
                  back and replace an interaction by a better, more
                  potent one between other variables, which induces
                  the given one.},
}

@article{joe-89relative,
  title =	 {Relative Entropy Measures of Multivariate
                  Dependence},
  author =	 {Harry Joe},
  journal =	 {J. Amer. Stat. Assoc.},
  volume =	 84,
  number =	 405,
  year =	 1989,
  pages =	 {157--164},
  pdf =		 {dependence/joe-89.pdf},
  comments =	 {Conditional mutual information is
                  evaluated. Requirements for "dependence measure"
                  given.},
}

@article{good-53,
  author =	 {I J Good},
  title =	 {The population frequencies of species and the
                  estimation of population parameters},
  journal =	 {Biometrika},
  volume =	 40,
  number =	 {3/4},
  year =	 1953,
  pages =	 {237--264},
  abstract =	 {A random sample is drawn from a population of
                  various species. (The theory may also be applied to
                  studies of literary vocabulary, for example). If a
                  particular species is represented r times in the
                  sample of size N, then r/N is not a good estimate of
                  the population frequency, p, when r is
                  small. Methods are given for estimating p, assuming
                  virtually nothing about the underlying
                  population. The estimates are expressed in terms of
                  smoothed values of the numbers n_r (r=1,2,3,...),
                  where n_r is the number of distinct species that are
                  each represented r times in the sample. (n_r may be
                  described as "the frequency of the frequency r".)
                  Turing is acknowledged for the most interesting
                  formula in this part of the work. An estimate of the
                  proportion of the population represented by the
                  species occurring in the sample is an immediate
                  corollary. Estimates are made of measures of
                  heterogeneity of the population, including Yule's
                  "characteristic" and Shannon's "entropy". Methods
                  are then discussed that do depend on assumptions
                  about the underlying population. It is here that
                  most work has been done by other writers. It is
                  pointed out that a hypothesis can give a good fit to
                  the numbers n_r but can give quite the wrong values
                  for the Yule's characteristics. An example of this
                  is Fisher's fit to some data of Williams's on
                  Macrolepidopters.},
  pdf =		 {discrete_learning/good-53.pdf},
  entered_on =	 {06/30/04},
  comments =	 {The paper introduces the now famous Good-Turing
                  estimator of probability masses for a discrete
                  variable. It is shown that the proper estimate of
                  counts is to replace them by (n+1)*(# of times
                  counts of n+1 has occurred in data)/(# of times
                  counts of n have occurred in data), and in their
                  turns these
                  "# of times" (the rank ordered plot in today's
                  terminology) should itself be smoothed. This result
                  itself is obtained under the assumption that the #
                  of bins with given counts after N samples is roughly
                  the same as after N+1 samples, that is, N>>1 (I
                  would prefer to have formal expansions in the paper,
                  but...). This is not necessarily true for bins with
                  just a 1 count for cases if there are a lot of such
                  bins (it is likely that a new sample will fall into
                  a new bin). This is also not true if the number of
                  bins with given counts is small, as a new sample
                  will change counts in a bin and, therefore, change
                  their number. The Dirichlet prior result is a
                  special case of this analysis. The main point of the
                  paper then is that the rank ordered plot can be
                  smoothed to provide a better estimate of the
                  probability masses. The smoothing technique
                  represents a priori assumptions about the underlying
                  probability distribution, and fitting to a
                  particular rank ordered form means determining the
                  underlying hyperparameters. In particular, if one
                  fits the rank ordered plot to the form in
                  \cite{nsb}, an estimator similar to NSB will be
                  obtained (Good does not do that, though he considers
                  a bunch of different smoothings). Using such
                  smoothing, Good suggests an entropy estimator
                  (Eq. 39), which we should test against NSB. There
                  are some substantial differences between the
                  Good-Turing estimator (even fitted to the Dirichlet
                  form) and NSB. The first one aims at unbiased
                  estimator of probabilities, which is not the same as
                  unbiased entropy estimator (this is why we have a
                  Jacobian in the NSB prior). Secondly, in order to
                  smooth things out, GT requires reasonably large
                  range of counts, from zero all the way to many. Thus
                  this is also a perturbative estimator, which
                  produces small changes about the ML estimation of
                  entropies (nice homework problem for students),
                  while NSB is expected to work in some nonasymptotic
                  case. Finally, NSB averages over the smoothing
                  parameter, while GT selects a particular value. This
                  may be very important when there are only few
                  coincidences (and the range of counts is small), and
                  when the hyperparameters cannot be determined
                  precisely. GT in such case will make a large error,
                  while NSB will behave better due to
                  averaging. Overall, in the asymptotic regime GT will
                  beat NSB, but in this case other empirical entropy
                  estimator (like \cite{strong-98}) will be even
                  better. In preasymptotic regime, GT will be no good,
                  while NSB will be good in some cases. For future
                  reference, this paper is also useful in analysing
                  what is the relation between the rank ordered plots
                  of a distribution and of its counts.},
}

@article{poschel-freund-02,
  journal =	 {Phys. Rev. E},
  volume =	 66,
  pages =	 {026103},
  year =	 2002,
  title =	 {Finite-sample frequency distributions originating
                  from an equiprobability distribution},
  author =	 {Thorsten P{\"o}schel and Jan A. Freund},
  abstract =	 {Given an equidistribution for probabilities p(i) =
                  1/N, i = 1,...,N. What is the expected corresponding
                  rank ordered frequency distribution f(i), i =
                  1,...,N, if an ensemble of M events is drawn?},
  pdf =		 {discrete_learning/poschel-freund-02.pdf},
  comments =	 {Related to
                  \cite{poschel-freund-03,poschel-etal-03}. I believe,
                  \cite{paninski-03} did it better.},
  entered_on =	 {05/10/04},
}

@article{poschel-freund-03,
  journal =	 {Biosystems},
  volume =	 69,
  year =	 2003,
  pages =	 {63--72},
  title =	 {How to decide whether small samples comply with an
                  equidistribution},
  author =	 {T P{\"o}schel and J A Freund},
  abstract =	 {The decision whether a measured distribution
                  complies with an equidistribution is a central
                  element of many biostatistical methods. High
                  throughput differential expression measurements, for
                  instance, necessitate to judge possible
                  over-representation of genes. The reliability of
                  this judgement, however, is strongly affected when
                  rarely expressed genes are pooled. We propose a
                  method that can be applied to frequency ranked
                  distributions and that yields a simple but efficient
                  criterion to assess the hypothesis of equiprobable
                  expression levels. By applying our technique to
                  surrogate data we exemplify how the decision
                  criterion can differentiate between a true
                  equidistribution and a triangular distribution. The
                  distinction succeeds even for small sample sizes
                  where standard tests of significance (e.g.,
                  $\chi^2$) fail. Our method will have a major impact
                  on several problems of computational biology where
                  rare events baffle a reliable assessment of
                  frequency distributions. The program package is
                  available upon request from the authors.},
  entered_on =	 {05/10/04},
  pdf =		 {discrete_learning/poschel-freund-03.pdf},
  comments =	 {Related to \cite{poschel-etal-03}. The idea is that
                  undersampling produces a definite rank-ordered plot
                  for the uniform distribution, and one should look at
                  the actual rank ordered form and compare it to the
                  one expected for the uniform distribution. I believe
                  that tests of homogeneity have been suggested
                  earlier in classical statistics literature (like in
                  the /dependence section of my library). Further, a
                  clear way to test for homogeneity is to estimate the
                  entropy either by \cite{nsb} or \cite{paninski-03}
                  and see if it is equal to log K. In particular,
                  Paninski's estimator is also in some sense rank
                  order based. The current work will produce results
                  only for N\sim K, while the NSB estimator will test
                  for N\sim \sqrt{K} (language of NSB). As uniform
                  distribution is typical in NSB, we should estimate
                  its entropy very precisely from small samples. Some
                  nice combinatorics results: probability of exactly k
                  bins to have exactly i samples in a uniform
                  distribution, etc. The method works by estimating K
                  from the number of bins that have got i counts. Such
                  estimation is done for all i, and if all estimated
                  K's agree, the underlying distribution is
                  uniform. In view of entropy estimation papers, this
                  work is almost useless.},
}
@article{poschel-etal-03,
  title =	 {Correction algorithm for finite sample statistics},
  author =	 {Thorsten P{\"o}schel and Werner Ebeling and Cornelius
                  Froemmel and Rosa Ramirez},
  url =		 {http://arxiv.org/abs/q-bio/0401040},
  journal =	 {Eur. Phys. J. E},
  volume =	 {12},
  pages =	 {531--541},
  year =	 {2003},
  pdf =		 {discrete_learning/poschel-etal-03.pdf},
  abstract =	 {Assume in a sample of size M one finds M_i
                  representatives of species i with i=1...N^*. The
                  normalized frequency p^*_i=M_i/M, based on the
                  finite sample, may deviate considerably from the
                  true probabilities p_i. We propose a method to infer
                  rank-ordered true probabilities r_i from measured
                  frequencies M_i. We show that the rank-ordered
                  probabilities provide important informations on the
                  system, e.g., the true number of species, the
                  Shannon- and the Renyi-entropies.},
  entered_on =	 {04/13/04},
  comments =	 {Good idea, less than perfect implementation. The key
                  equation (17) is not supported analytically. All of
                  the analysis is done numerically, not even simple,
                  linearized approximations are done analytically to
                  understand when the program will work. The idea to
                  learn entropy by rank-ordering and restoring the
                  order is basically that of \cite{paninski-03}. But,
                  again, pointing the difference between rank order
                  frequencies and probabilities is a good thing to
                  do.},
}

@incollection{mcallester-ortiz-03,
  booktitle =	 {Adv. in Neur. Inf. Proc. Syst. 15 (2002)},
  author =	 {David McAllester and Luis Ortiz},
  title =	 {Concentration Inequalities for the Missing Mass and
                  for Histogram Rule Error},
  editor =	 {S. Becker and S. Thrun and K. Obermayer},
  publisher =	 {MIT Press},
  address =	 {Cambridge, MA},
  entered_on =	 {04/13/04},
  year =	 2003,
  pages =	 {343--350},
  pdf =		 {discrete_learning/mcallester-ortiz-03.pdf},
  abstract =	 {This paper gives distribution-free concentration
                  inequalities for the missing mass and the error rate
                  of histogram rules. Negative association methods can
                  be used to reduce these concentration problems to
                  concentration questions about independent
                  sums. Although the sums are independent, they are
                  highly heterogeneous. Such highly heterogeneous
                  independent sums cannot be analyzed using standard
                  concentration inequalities such as Hoeffding's
                  inequality, the Angluin-Valiant bound, Bernstein's
                  inequality, Bennett's inequality, or McDiarmid's
                  theorem.},
  comments =	 {Statistical mechanics-like analysis of the large
                  deviation theory in order to uncover how much of the
                  probability mass is missing after m observations
                  (missing = the outcome has not yet occurred). I do
                  not follow the details, but the paper seems to make
                  some very strong inequalities to prove the
                  inequality of interest (like replacing the
                  occurrence index variables by their independent
                  equivalents, etc.). AFAIK, no way for actually
                  estimating the missing mass is given (and it cannot
                  be given for countably infinite number of
                  bins). Instead the authors prove that in each actual
                  data realization the measured value of the missing
                  mass is not far away from its expectation. In
                  \cite{nsb} we tried to actually estimate the missing
                  mass (which was \sim K\beta), but this was within
                  some class of probability distributions.},
}


@inproceedings{rasmussen-00,
  booktitle =	 {Advances in Neural Information Processing Systems},
  volume =	 12,
  editor =	 {S. A. Solla and T. K. Leen and K.-R. M{\"u}ller},
  pages =	 {554--560},
  publisher =	 {MIT Press},
  year =	 2000,
  address =	 {Cambridge, MA},
  title =	 {The Infinite Gaussian Mixture Model},
  author =	 {Carl Edward Rasmussen},
  abstract =	 {In a Bayesian mixture model it is not necessary a
                  priori to limit the number of components to be
                  finite. In this paper an infinite Gaussian mixture
                  model is presented which neatly sidesteps the
                  difficult problem of finding the "right" number of
                  mixture components. Inference in the model is done
                  using an efficient parameter-free Markov Chain that
                  relies entirely on Gibbs sampling.},
  url =
                  {http://www.nips.cc/Web/Groups/NIPS/1999/99abstracts.html},
  entered_on =	 {11/03/03},
  pdf =		 {discrete_learning/rasmussen-00.pdf},
  comments =	 {The infinite Dirichlet mixture in this paper
                  actually corresponds to taking the limit of the
                  "alphabet size" of \cite{nsb} to infinity. The
                  Dirichlet parameter (\beta of \cite{nsb}) is not
                  being varied or integrated over. Good literature
                  collection. "Predictive distribution" term.},
}

@inproceedings{ristad-98,
  author =	 {Eric Sven Ristad},
  title =	 {A natural law of succession},
  booktitle =	 {ISIT},
  year =	 {1998},
  address =	 {Cambridge, MA},
  month =	 aug,
  entered_on =	 {10/30/03},
  pdf =		 {discrete_learning/ristad-98.pdf},
  abstract =	 {We present a new solution to multinomial estimation
                  and demonstrate that our solution outperforms
                  standard solutions both in theory and in
                  practice. The novelty of our approach lies in our
                  use of combinatorial priors on strings.},
  comments =	 {See \cite{ristad-95} for a review.},
}



@techreport{ristad-95,
  author =	 {Eric Sven Ristad},
  title =	 {A natural law of succession},
  institution =	 {Princeton Univ., Dept of Comp. Sci.},
  year =	 {1995},
  month =	 may,
  number =	 {CS-TR-495-95},
  entered_on =	 {11/03/03},
  pdf =		 {discrete_learning/ristad-95.pdf},
  abstract =	 {Consider the problem of multinomial estimation. You
                  are given an alphabet of distinct symbols and are
                  told the frequency with which each symbol has
                  occurred in the past. On the basis of the
                  information alone, you must now estimate symbol
                  probabilities. In this report, we present a new
                  solution to this fundamental problem in statistics
                  and demonstrate that our solution outperforms
                  standard approaches, both in theory and in
                  practice.},
  comments =	 {The main ideas of the paper are: (1) Assign a priori
                  uniform measure to strings (not to probabilities);
                  this is similar in spirit (but not in realization)
                  to the "uniform prior on entropies" in \cite{nsb};
                  Eq 4 in this paper is obvious for the uniform prior
                  in \cite{nsb}. (2) Sequences are drawn not from full
                  alphabets, but only from their subsets. So one may
                  consider all possible subalphabets of the full
                  alphabet, and put a uniform prior on them. Two
                  variants: either all nonempty subalphabets are a
                  priori equally likely, or all cardinalities of
                  subsets are equally likely (with uniform prior
                  within fixed cardinality). Uniform cardinality prior
                  is equivalent to "selecting K" for fixed uniform
                  prior, $\beta=1$, in \cite{nsb2}. It drives the
                  estimates to low values of K. The uniform
                  subalphabets prior also drives to smaller K;
                  however, a priori probability for cardinalities of
                  roughly K/2 is huge, and it takes time to overcome
                  that and get to small K.  \cite{nsb2} has made a too
                  strong comment in saying that, since effective
                  excess cardinality will tend to 0, the method is not
                  interesting. Apparently, the variance around the
                  most likely value of 0 tends to zero only as 1/N,
                  giving (somehow) a strong 1/N^2 contribution to
                  larger cardinalities. Still, such fast decrease of
                  probability on unobserved is somewhat
                  problematic. 1911 days of observation with uniform
                  cardinalities (too fast falloff, I think) is
                  equivalent to 1.8e6 observations for Laplace for the
                  sun rising problem, and 1911 NSB observations are
                  equivalent to 23000 Laplace observations. Which one
                  is better -- who knows? Example of the random number
                  generator is yet stranger -- nobody wants to predict
                  with probability 1 (NSB would give 1/K prediction,
                  which I actually prefer in this case). This method
                  should work fine when the cardinality of the total
                  space is not too large compared to N, so that
                  non-occurrence of some values really means that they
                  are "not in the alphabet". For asymptotically large
                  K this will probably not work. This is manifested in
                  the examples shown at the end of the paper
                  (especially for uniform cardinalities).},
}

@misc{escobar-94,
  author =	 {Michael D. Escobar and Mike West},
  title =	 {Bayesian density estimation and inference using
                  mixtures},
  year =	 {1994},
  abstract =	 {We describe and illustrate Bayesian inference in
                  models for density estimation using mixtures of
                  Dirichlet processes. These models provide natural
                  settings for density estimation, and are exemplified
                  by special cases where data are modelled as a sample
                  from mixtures of normal distributions. Efficient
                  simulation methods are used to approximate various
                  prior, posterior and predictive distributions. This
                  allows for direct inference on a variety of practical
                  issues, including problems of local versus global
                  smoothing, uncertainty about density estimates,
                  assessment of modality, and the inference on the
                  numbers of components. Also, convergence results are
                  established for a general class of normal mixture
                  models.},
  howpublished = {Duke University},
  entered_on =	 {08/19/2003},
  pdf =		 {discrete_learning/escobar-west-94.pdf},
  comments =	 {Mixture of Gaussians is discussed, with the pdf over
                  mixtures being of the Dirichlet form. Selecting the
                  number of mixture elements and the Dirichlet
                  parameter is analysed. The references to
                  \cite{ferguson-83} and \cite{antoniak-74} are very
                  useful. The parameter $\alpha$ in this paper
                  corresponds to $\beta*K$ in \cite{nsb}. The paper
                  focuses on the cardinality of the space $K$ being
                  small, and the "mixture" refers to putting some pdf
                  at every discrete point of this space, not a mixture
                  of priors over different smoothing parameters. The
                  prior distribution of the number of mixture elements
                  is basically given by the pdf of having a given
                  number of events out of $N$ to be distinct. The
                  whole model for density estimation is close to
                  kernel smoothing, where one puts one mixture at
                  every sample point (with the weight of this mixture
                  determined by the frequency of this point), and
                  then some Dirichlet-derived pdf in between the
                  data. Estimation of the smoothing parameter $\alpha$
                  is done by (sort of) Bayesian posterior averaging
                  similar to \cite{nsb}. The paper's main point is
                  efficient computations of the posteriors.},
}

@article{antoniak-74,
  author =	 {C. E. Antoniak},
  title =	 {Mixtures of {Dirichlet} Processes with Applications
                  to {Bayesian} Nonparametric Problems},
  journal =	 {Ann.\ Stat.},
  year =	 1974,
  volume =	 2,
  pages =	 {1152--1174},
  entered_on =	 {08/19/2003},
  comments =	 {TO_GET. Supposed to have a review of Dirichlet
                  processes.},
}

@incollection{ferguson-83,
  author =	 {T. S. Ferguson},
  year =	 1983,
  title =	 {Bayesian density estimation by mixtures of normal
                  distributions},
  booktitle =	 {Recent advances in statistics},
  editor =	 {H. Rizvi and J. Rustagi},
  address =	 {New York},
  publisher =	 {Academic Press},
  pages =	 {287--302},
  entered_on =	 {08/19/2003},
  comments =	 {TO_GET. Supposed to have a review of learning with
                  Dirichlet mixtures.},
}

@misc{raman-00,
  title =	 {The {L}aplace {R}ule of {S}uccession Under A General
                  Prior},
  author =	 {Kalyan Raman},
  howpublished = {University of Michigan in Flint, School of
                  Management},
  month =	 may,
  year =	 {2000},
  entered_on =	 {08/19/03},
  pdf =		 {discrete_learning/raman-00.pdf},
  comments =	 {The paper discusses the generalization of the
                  Laplace's rule for general priors by means of
                  expanding these priors as power series, thus forming
                  a Dirichlet mixture prior (though the author does
                  not use this term). There is almost nothing in the
                  paper beyond \cite{wolpert-wolf-95}.},
}

@misc{chow-98,
  author =	 {William Chow},
  title =	 {A {D}irichlet process mixture model of overnight
                  {HIBOR}},
  month =	 dec,
  year =	 {1998},
  pdf =		 {discrete_learning/chow-98.pdf},
  entered_on =	 {08/18/2003},
  abstract =	 {This paper applies a class of Bayesian mixture model
                  to the Overnight Hong Kong Interbank Offer Rates
                  (HIBOR), with an intention to identify structural
                  changes in regimes governing the HIBOR process. In
                  so doing, the role of the Hong Kong Monetary
                  Authority (HKMA), the de facto Central Bank of Hong
                  Kong, is also explored. Nonlinearity of the time
                  series is approximated by a locally weighted mixture
                  of linear autoregressions specified with a mixture
                  of Dirichlet processes prior on the mixing
                  parameters of the hierarchical model. It is found
                  that the active involvement of HKMA in the interbank
                  market does have an impact on not only the levels
                  but also the volatility of the HIBOR. However, the
                  effectiveness of interest rate as a means to sustain
                  the HK\$/US\$ peg rate is much less obvious.},
  comments =	 {I got this paper when searching for Dirichlet
                  mixtures literature for \cite{nsb}. I don't quite
                  understand this paper, but it seems mostly
                  irrelevant for our purposes.},
}


@Article{sjolander-etal-96,
  author =	 {K. Sj{\"o}lander and K. Karplus and M. Brown and
                  R. Hughey and A. Krogh and I. S. Mian and
                  D. Haussler},
  title =	 {{Dirichlet} Mixtures: A Method for Improved Detection
                  of Weak but Significant Protein Sequence Homology},
  journal =	 {CABIOS},
  volume =	 12,
  number =	 4,
  pages =	 {327--345},
  month =	 aug,
  year =	 1996,
  entered_on =	 {10/09/03},
  abstract =	 {This paper presents the mathematical foundations of
                  Dirichlet mixtures, which have been used to improve
                  database search results for homologous sequences,
                  when a variable number of sequences from a protein
                  family or domain are known. We present a method for
                  condensing the information in a protein database
                  into a mixture of Dirichlet densities. These
                  mixtures are designed to be combined with observed
                  amino acid frequencies, to form estimates of
                  expected amino acid probabilities at each position
                  in a profile, hidden Markov model, or other
                  statistical model. These estimates give a
                  statistical model greater generalization capacity,
                  such that remotely related family members can be
                  more reliably recognized by the model. Dirichlet
                  mixtures have been shown to outperform substitution
                  matrices and other methods for computing these
                  expected amino acid distributions in database
                  search, resulting in fewer false positives and false
                  negatives for the families tested. This paper
                  corrects a previously published formula for
                  estimating these expected probabilities, and
                  contains complete derivations of the Dirichlet
                  mixture formulas, methods for optimizing the
                  mixtures to match particular databases, and
                  suggestions for efficient implementation. },
  postscript =	 {discrete_learning/sjolander-etal-96.ps.gz},
  url =
                  {http://www.cse.ucsc.edu/research/compbio/dirichlets/dirichlet-papers.html},
  comments =	 {The abstract/title say it all -- the paper uses
                  finite Dirichlet mixtures (pseudocounts) for
                  calculation of pdf's on protein sequences. The paper
                  has a good review of learning with Dirichlet
                  density, and a good collection of the relevant
                  literature.},
}

@TechReport{karplus-95,
  author =	 {Kevin Karplus},
  title =	 {Regularizers for estimating distributions of
                  amino acids from small samples},
  institution =	 {UC Santa Cruz, Computer Science Department},
  year =	 1995,
  number =	 {UCSC-CRL-95-11},
  month =	 mar,
  entered_on =	 {08/11/2001},
  abstract =	 {This paper examines several different methods for
                  estimating the distribution of amino acids in
                  specific context, given a very small sample of amino
                  acids from that distribution. These distribution
                  estimators, sometimes called regularizers, are
                  frequently used when aligning sequences to each
                  other, or to models such as profiles or Hidden
                  Markov Models. The distribution estimates considered
                  here are zero-offsets, pseudocounts, substitution
                  matrices (with several variants), feature alphabets,
                  and Dirichlet mixture regularizers. A new method is
                  presented for setting the parameters of the
                  regularizers to minimize the encoding cost (also
                  called the entropy) of the training data, for all
                  possible samples from the training data. The optimal
                  parameter settings depend on the size of the sample,
                  but the optimization method can also be used to get
                  a good performance over a range of sample sizes. The
                  optimal settings for this methods are not the same
                  as the traditional values used for the
                  parameters. The regularizers are evaluated based on
                  how well they estimate the distribution of the
                  columns of a multiple alignment -- specifically, the
                  expected encoding cost per amino acid using the
                  regularizer method and all possible samples from
                  each column. The differences between the
                  regularizers are fairly small (less than 0.2 bits
                  per column) but large enough to make a significant
                  difference when many columns are combined as is done
                  in an alignment. In general, the pseudocounts have
                  the lowest encoding costs for sample of size zero,
                  substitution matrices have the lowest encoding costs
                  for samples of size one, and Dirichlet mixtures have
                  the lowest for larger samples. One of the
                  substitution matrices variants, which added
                  pseudocounts and scaled counts, does almost as well
                  as the best Dirichlet mixtures, but with a lower
                  computation cost.},
  url =		 "discrete_learning/karplus-95.ps",
  comments =	 {The paper reviews (and compares) different methods
                  for estimating extremely undersampled discrete
                  probability distributions (in some cases the number
                  of samples is just a few). The methods are compared
                  by cross-validation type techniques. First the
                  parameters of the estimators are trained on some
                  part of the data, and then the ``quality'' of the
                  method is defined as the empirical entropy of the
                  estimator on the rest of the data. The smaller is
                  the empirical entropy, the smaller are the coding
                  costs, and thus the better the method is. The
                  following regularizers are used. (1) Zero-offset:
                  adding some (the same) count to each
                  bin. Interestingly, the author notes that
                  $offset=1/size of alphabet$ produces the shortest
                  codes. This agrees with
                  \cite{schurmann-grassberger-96}, and with
                  \cite{nsb}. (2) Pseudocounts: adding different
                  zero-offsets to each bin. (3) Substitution matrices:
                  instead of the actual vector of counts, its product
                  with some mixing matrix is used. The matrix
                  ``spreads out'' the counts into the bins which may
                  be considered ``nearby''. (4) Gribskov average score
                  method: similar to ``substitution'', but now the
                  mixing happens in the exponent of the ``new''
                  counts. (5) Substitution matrices plus
                  pseudocounts. (6) Substitution plus pseudocounts
                  plus scalings. Here the new counts are the
                  pseudocounts, plus mixed counts, plus the actual
                  counts multiplied by a scaling function that grows
                  fast enough with the sample size. This method is
                  introduced so that in the limit of large sample size
                  the estimator approaches the ML. (7) Feature
                  alphabets -- most commonly occurring combinations of
                  data become the new words in the extended
                  alphabet. (8) Dirichlet mixtures: mixture with some
                  nontrivial weights of estimators (1) or (2) with
                  different offsets and pseudocounts. If instead of a
                  small number of mixtures (9 or so used in the
                  paper), the integral over all possible mixtures was
                  used, we would get almost the same method as in our
                  \cite{nsb}.},
}


@Article{schurmann-grassberger-96,
  author =	 {Thomas Sch{\"u}rmann and Peter Grassberger},
  title =	 {Entropy estimation of symbol sequences},
  journal =	 {Chaos},
  volume =	 {6},
  number =	 {3},
  year =	 {1996},
  pages =	 {414--427},
  entered_on =	 "03/05/2001, 09/15/2003",
  url =		 "discrete_learning/schurmann-grassberger-96.pdf",
  keywords =	 "symbol sequences, discrete variables, discrete
                  learning, entropy estimation, Laplace rule, language
                  entropies, Shakespeare, Bible, LOB corpus, Ziv,
                  Lempel, suffix trees, Rissanen's method",
  comments =	 {The paper starts as a nice review of methods for
                  estimation of entropies. The authors notice that the
                  entropy is usually underestimated, and that known
                  corrections for this bias depend strongly on correct
                  knowledge of the studied distribution, thus they are
                  mostly useless. A review of various coding
                  (compression) algorithms is provided. Bayesian
                  priors of the type $Q^a$ are discussed and $a=-1+1/K$
                  are hypothesised to be the best. Unfortunately, when
                  it comes to numerical analysis, the authors only
                  present pictures, but show no data and no details of
                  the fits to see the fits' quality. Even worse, the
                  authors never say which of the many methods to
                  estimate entropies that they have discussed they
                  actually used in their numerics. Specifically, I
                  also disagree with the statement after eq. 7
                  (Harris's correction to naive entropy estimator up
                  to two orders), saying that it's usually easy to
                  estimate M in the (M-1)/2N first order correction
                  with the number of different observed words. See
                  \cite{panzeri-treves-96} and \cite{strong-etal-98}
                  on (very non-trivial) attempts to find this M in
                  real data analysis.},
  abstract =	 {We discuss algorithms for estimating the Shannon
                  entropy h of finite symbol sequences with long range
                  correlations. In particular, we consider algorithms
                  which estimate h from the code lengths produced by
                  some compression algorithm. Our interest is in
                  describing their convergence with sequence length,
                  assuming no limits for the space and time
                  complexities of the compression algorithms. A
                  scaling law is proposed for extrapolation from
                  finite sample lengths. This is applied to sequences
                  of dynamical systems in non-trivial chaotic regimes,
                  a 1-D cellular automaton, and to written English
                  texts. (C) 1996 American Institute of Physics.},
}

@InProceedings{skilling-89,
  author =	 {John Skilling},
  title =	 {Classic maximum entropy},
  booktitle =	 {Maximum entropy and the Bayesian methods, Cambridge,
                  England, 1988.},
  crossref =	 {maxent-88},
  pages =	 {45--52},
  year =	 {1989},
  editor =	 {J. Skilling},
  volume =	 {8},
  publisher =	 {Kluwer Academic Publishers},
  url =		 "maximum_entropy/skilling-89.pdf",
  entered_on =	 "03/05/2001",
  keywords =	 "MAXENT, maximum entropy, Bayesian inference",
  comments =	 "No attempt to determine the smoothness penalty
                  parameter is done. While the arguments that get the
                  factor of {\em sqrt(Q)} in the denominator of the
                  prior look plausible, I do not fully buy them. See
                  also \cite{tikochinsky-etal-84}.",
  abstract =	 {This paper presents a fully Bayesian derivation of
                  maximum entropy image reconstruction. The argument
                  repeatedly goes from the particular to the general,
                  in that {\ul if} there are general theories {\ul
                  then} they must apply to special cases. Two such
                  special cases, formalised as the ``Cox axioms'',
                  lead to the well-known fact that the Bayesian
                  probability theory is the only consistent language
                  of inference. Further cases, formalised as the
                  axioms of maximum entropy, show that the prior
                  probability distribution for any positive, additive
                  distribution must be monotonic in the
                  entropy. Finally, a quantified special case shows
                  that this monotonic function must be exponential,
                  leaving only a single dimensional scaling factor to
                  be determined a posteriori. Many types of
                  distributions, including probability distributions
                  themselves, are positive and additive, so the
                  entropy exponential is very general. The following
                  paper (Gull 1989) applies these ideas to image
                  reconstruction, showing how a sophisticated
                  treatment can incorporate prior expectation of
                  spatial correlations.},
}



@InProceedings{strauss-etal-93,
  author =	 {C. E. Strauss and D. H. Wolpert and D. R. Wolf},
  title =	 {Alpha, evidence and the entropic prior},
  booktitle =	 {Maximum entropy and the Bayesian methods, Paris,
                  France, 1992.},
  crossref =	 {maxent-92},
  pages =	 {113--120},
  year =	 {1993},
  editor =	 {A. Mohammad-Djafari and G. Demoment},
  volume =	 {12},
  publisher =	 {Kluwer Academic Publishers},
  url =		 "maximum_entropy/strauss-wolpert-wolf-93.pdf",
  entered_on =	 "03/05/2001",
  keywords =	 "MAXENT, maximum entropy, Bayesian inference,
                  Occam factors, discrete learning, discrete
                  variables",
  comments =	 "The authors use the prior {\em
                  P[Q]~exp(aS[Q])/sqrt(Q)} \cite{skilling-89}. For
                  {\em a=0} they calculate the expected value of the
                  entropy for different data
                  (cf. \cite{wolpert-wolf-95}). Useful expressions (but
                  no full calculations) are given for other values of
                  {\em a}. Their attack on the ``evidence''
                  approximations for the a posteriori determination of
                  the penalty parameter {\em a} amounts to saying that
                  it is a bad approximation of the Bayes posterior,
                  not that the Bayesian method won't work.",
  abstract =	 {First, the correct entropic prior is computed by
                  marginalization of alpha. This is followed by a
                  discussion of improvements to the ``evidence''
                  approximation. Surprisingly, it appears that the
                  approximation used to restore the famous ``Susie''
                  image may have questionable aspects.},
}


@Comment{Parent proceedings entries follow. Classic BibTeX requires a
  cross-referenced entry to appear after every entry that refers to it
  through crossref, so keep maxent-88 and maxent-92 below skilling-89
  and strauss-etal-93.}

@Proceedings{maxent-88,
  title =	 {Maximum entropy and the Bayesian methods, Cambridge,
                  England, 1988.},
  booktitle =	 {Maximum entropy and the Bayesian methods, Cambridge,
                  England, 1988.},
  year =	 {1989},
  editor =	 {J. Skilling},
  volume =	 {8},
  publisher =	 {Kluwer Academic Publishers},
  keywords =	 {MAXENT, maximum entropy},
  entered_on =	 "03/05/2001",
}

@Proceedings{maxent-92,
  title =	 {Maximum entropy and the Bayesian methods, Paris,
                  France, 1992.},
  booktitle =	 {Maximum entropy and the Bayesian methods, Paris,
                  France, 1992.},
  year =	 {1993},
  editor =	 {A. Mohammad-Djafari and G. Demoment},
  volume =	 {12},
  publisher =	 {Kluwer Academic Publishers},
  keywords =	 {MAXENT, maximum entropy},
  entered_on =	 "03/05/2001",
}


@Article{tikochinsky-etal-84,
  author =	 {Y. Tikochinsky and N. Z. Tishby and R. D. Levine},
  title =	 {Consistent inference of probabilities for
                  reproducible experiments},
  journal =	 {Phys. Rev. Lett.},
  year =	 {1984},
  volume =	 {52},
  number =	 {16},
  pages =	 {1357--1360},
  month =	 apr,
  entered_on =	 "03/05/2001",
  url =		 "maximum_entropy/tikochinsky-tishby-levine-84.pdf",
  keywords =	 "MAXENT, maximum entropy",
  comments =	 "More will be added soon. See also
                  \cite{skilling-89}.",
  abstract =	 {The need for inducing a probability distribution
                  from partial data and the complementary problem of
                  the analysis of an observed distribution in terms of
                  fewer relevant variables occur in many branches of
                  physics. For reproducible experiments, consistency
                  conditions which must be satisfied by any algorithm
                  for inferring a discrete probability distribution
                  with given averages are formulated. The only
                  consistent algorithm is the one leading to the
                  distribution of maximal entropy subject to the given
                  constraints.},
}

@Article{wolpert-wolf-95,
  author =	 "D. H. Wolpert and D. R. Wolf",
  title =	 "Estimating functions of probability distributions
                  from a finite set of samples",
  journal =	 "Phys. Rev. E",
  volume =	 "52",
  number =	 "6",
  pages =	 "6841--6854",
  year =	 1995,
  entered_on =	 "03/05/2001",
  keywords =	 "_NEW_, discrete variables, discrete learning,
                  entropy estimation, Bayesian inference, flat prior",
  url =		 "discrete_learning/wolpert-wolf-95.pdf",
  comments =	 {The authors develop a complete formalism for
                  calculating expectation values of quantities that
                  are expressible in terms of sums and products of Q
                  (the unknown distribution). The priors used must be
                  of the same form. Throughout the paper they actually
                  use the prior which is flat on the space of
                  Q's. Erratum available
                  \cite{wolpert-wolf-95-errata}. Note that even the
                  erratum has not corrected a mistake in Theorem
                  8. The first argument of the second function $\Delta
                  \Phi ^{(1)}$ in the first line of the expression for
                  $s_2/s_0$ should read $n_j+2$ instead of $n_j+1$.},
  abstract =	 {This paper addresses the problem of estimating a
                  function of a probability distribution from a finite
                  set of samples of that distribution. A Bayesian
                  analysis of this problem is presented, the optimal
                  properties of the Bayes estimators are discussed,
                  and as an example of the formalism, closed form
                  expressions for the Bayes estimators for the moments
                  of the Shannon entropy function are derived. Then
                  numerical results are presented that compare the
                  Bayes estimator to the frequency-counts estimator
                  for the Shannon entropy. We also present the closed
                  form estimators, all derived elsewhere, for the
                  mutual information, $\chi^2$, covariance, and some
                  other statistics.},
}

@Misc{wolpert-wolf-95-II,
  author =	 "D. H. Wolpert and D. R. Wolf",
  title =	 "Estimating functions of probability distributions
                  from a finite set of samples. Part {II}: {Bayes}
                  estimators for mutual information, chi-squared,
                  covariance, and other statistics",
  howpublished = {arXiv preprint comp-gas/9403002},
  eprint =	 {comp-gas/9403002},
  archiveprefix = {arXiv},
  year =	 1994,
  entered_on =	 "03/05/2001",
  url =		 "http://arXiv.org/abs/comp-gas/9403002",
  keywords =	 "discrete variables, discrete learning, entropy
                  estimation, Bayesian inference, flat prior",
  comments =	 {The second part of
                  \cite{wolpert-wolf-95}. Calculations for more
                  difficult cases done here. Unless details of
                  calculations are of interest, only the first part
                  should be read.},
  abstract =	 {We present estimators for entropy and other
                  functions of a discrete probability distribution
                  when the data is a finite sample drawn from that
                  probability distribution. In particular, for the
                  case when the probability distribution is a joint
                  distribution, we present finite sample estimators
                  for the mutual information, covariance, and
                  chi-squared functions of that probability
                  distribution. },
}

@Article{wolpert-wolf-95-errata,
  author =	 "D. H. Wolpert and D. R. Wolf",
  title =	 "Estimating functions of probability distributions
                  from a finite set of samples (errata: vol 52,
                  pg 6841, 1995)",
  journal =	 "Phys. Rev. E",
  volume =	 "54",
  number =	 "6",
  pages =	 "6973",
  year =	 "1996",
  entered_on =	 "03/05/2001",
  url =		 "discrete_learning/wolpert-wolf-95-errata.pdf",
  comments =	 "Really minor errors.",
  keywords =	 "errata, discrete variables,
                  discrete learning, entropy estimation, Bayesian
                  inference, flat prior",
}

@article{israeli-goldenfeld-04,
  title =	 {Computational Irreducibility and the predictability
                  of complex physical systems},
  author =	 {N. Israeli and N. Goldenfeld},
  abstract =	 {Using elementary cellular automata (CA) as an
                  example, we show how to coarse grain CA in all
                  classes of Wolfram's classification. We find that
                  computationally irreducible physical processes can
                  be predictable and even computationally reducible at
                  a coarse-grained level of description. The resulting
                  coarse-grained CA which we construct emulate the
                  large-scale behavior of the original systems without
                  accounting for small-scale details. At least one of
                  the CA that can be coarse grained is irreducible and
                  known to be a universal Turing machine. },
  journal =	 {Phys. Rev. Lett.},
  year =	 2004,
  volume =	 92,
  pages =	 {074105},
  pdf =		 {dynamical_systems/israeli-goldenfeld-04.pdf},
  comments =	 {The authors coarse-grain CAs in such a way that
                  coarse grain (evolution) = evolution (coarse
                  grain). That is, they want to stay within the
                  deterministic CA framework. I think it might be
                  interesting to coarse grain into probabilistic rules
                  -- simply remove rows or columns and average over
                  them. This should take us closer to physical
                  continuous probabilistic models.},
  entered_on =	 {10/07/2004},
}


@article{eubank-etal-04,
  title =	 {Modelling disease outbreaks in realistic urban
                  social networks},
  author =	 {Stephen Eubank and Hasan Guclu and V. S. Anil Kumar
                  and Madhav V. Marathe and Aravind Srinivasan and
                  Zoltan Toroczkai and Nan Wang},
  abstract =	 {Most mathematical models for the spread of disease
                  use differential equations based on uniform mixing
                  assumptions or ad hoc models for the contact
                  process. Here we explore the use of dynamic
                  bipartite graphs to model the physical contact
                  patterns that result from movements of individuals
                  between specific locations. The graphs are generated
                  by large-scale individual-based urban traffic
                  simulations built on actual census, land-use and
                  population-mobility data. We find that the contact
                  network among people is a strongly connected
                  small-world-like graph with a well-defined scale for
                  the degree distribution. However, the locations
                  graph is scale-free, which allows highly efficient
                  outbreak detection by placing sensors in the hubs of
                  the locations network. Within this large-scale
                  simulation framework, we then analyse the relative
                  merits of several proposed mitigation strategies for
                  smallpox spread. Our results suggest that outbreaks
                  can be contained by a strategy of targeted
                  vaccination combined with early detection without
                  resorting to mass vaccination of a population.},
  pdf =		 {dynamical_systems/eubank-etal-04.pdf},
  entered_on =	 {05/23/04},
  comments =	 {You gotta love scale-free plots over 0.5 decades! By
                  huge simulations this paper says to us: if you want
                  to stop diseases, vaccinate (or kill) those who
                  travel far. Is this really new? People knew this in
                  the middle ages, I guess.},
  journal =	 {Nature},
  volume =	 429,
  number =	 6988,
  pages =	 {180--184},
  year =	 2004,
}

@article{eckmann-ruelle-92,
  author =	 {J.-P. Eckmann and D. Ruelle},
  title =	 {Fundamental limitations for estimating dimensions
                  and {Lyapunov} exponents in dynamical systems},
  journal =	 {Physica D},
  year =	 1992,
  volume =	 56,
  pages =	 {185--187},
  entered_on =	 {10/31/03},
  abstract =	 {We show that values of the correlation dimension
                  estimated over a decade from the
                  Grassberger-Procaccia algorithm cannot exceed the
                  value of $2\log_{10} N$ if $N$ is the number of
                  points in the time series. When this bound is
                  saturated it is thus not legitimate to conclude that
                  low dimensional dynamics is present. The estimation
                  of Lyapunov exponents is also discussed.},
  pdf =		 {dynamical_systems/eckmann-ruelle-92.pdf},
  comments =	 {While the conclusion of the paper (that one should
                  carefully check what he believes he derived) is
                  correct, I would be worried to blatantly apply
                  this bound.},
}


@article{cohen-procaccia-85,
  author =	 {A. Cohen and I. Procaccia},
  title =	 {Computing the Kolmogorov entropy from time signals
                  of dissipative and conservative dynamical systems},
  journal =	 {Phys. Rev. A},
  year =	 1985,
  volume =	 31,
  pages =	 {1872--1882},
  keywords =	 {discretization},
  url =		 {dynamical_systems/cohen_procaccia_85.pdf},
  entered_on =	 {08/18/2001},
  comments =	 {This paper deals with estimation of entropy rates
                  by methods similar to those in
                  \cite{grassberger-procaccia-83}. The methods are
                  better developed, proofs are given, effects of
                  partitioning are discussed, and more examples are
                  shown. Unlike the \cite{grassberger-procaccia-83},
                  this paper deals with Kolmogorov-Shannon (and not
                  Renyi's) entropies.},
  abstract =	 {The extraction of the Kolmogorov (metric) entropy
                  from an experimental time signal is discussed.
                  Theoretically we stress the concept of generators
                  and that the existence of an expansive constant
                  guarantees that a finite-time series would be
                  sufficient for the calculation of the metric
                  entropy. On the basis of the theory we attempt to
                  propose optimal algorithms which are tested on a
                  number of examples. The approach is applicable to
                  both dissipative and conservative dynamical
                  systems.},
}

@article{ebeling-etal-96,
  author =	 {W. Ebeling and J. Freund and K. Rateitschak},
  journal =	 {Int. J. Bifurcat. and Chaos},
  title =	 {Entropy and extended memory in discrete chaotic
                  dynamics},
  year =	 1996,
  volume =	 6,
  pages =	 611,
  entered_on =	 {08/15/2001},
  comments =	 {A derivation of the subextensive entropic correction
                  for the logistic map should be in this article
                  according to \cite{ebeling-97}.},
}

@article{ebeling-nicolis-92,
  author =	 {W. Ebeling and G. Nicolis},
  journal =	 {Chaos, Solitons, and Fractals},
  volume =	 2,
  title =	 {Word frequency and entropy of symbolic sequences: a
                  dynamical perspective},
  year =	 1992,
  pages =	 635,
  entered_on =	 {08/15/2001},
  keywords =	 {TO_GET},
  comments =	 {A derivation of the subextensive entropic correction
                  for the logistic map should be in this article
                  according to \cite{ebeling-97}.},
}

@article{grassberger-procaccia-83,
  author =	 {P. Grassberger and I. Procaccia},
  title =	 {Estimation of the {Kolmogorov} entropy from a chaotic
                  signal},
  journal =	 {Phys. Rev. A},
  volume =	 28,
  pages =	 {2591--2593},
  year =	 1983,
  entered_on =	 {08/18/2001},
  abstract =	 {A new method for estimating the Kolmogorov entropy
                  directly from a time signal is proposed and tested
                  on examples. The method should prove valuable for
                  characterizing experimental chaotic signals.},
  url =		 {dynamical_systems/grassberger_procaccia_83.pdf},
  comments =	 {The paper suggests a method for estimation of the
                  Renyi (mostly, order 2) entropy rate of a chaotic
                  process (on a continuous space) as a difference
                  between n- and n-1-gram entropies as $n \to
                  \infty$. The R2 entropies are the easiest to
                  estimate, and they bound the Shannon entropy from
                  below; for most examples discussed in the paper, the
                  bounds are actually rather tight. The R2 entropies
                  in their turn are to be determined by counting
                  near-coincidences (points closer than $\epsilon$
                  from each other) in the n-gram space, and then
                  taking the coincidence threshold $\epsilon$ to
                  zero.},
}

@article{kantz-schurmann-96,
  title =	 {Enlarged scaling ranges for the {KS}-entropy and the
                  information dimension},
  author =	 {Holger Kantz and Thomas Sch{\"u}rmann},
  journal =	 {Chaos},
  volume =	 6,
  number =	 2,
  year =	 1996,
  pages =	 {167--171},
  entered_on =	 {08/18/2001},
  abstract =	 {Numerical estimates of the Kolmogorov-Sinai entropy
                  based on a finite amount of data decay towards zero
                  in the relevant limits. Rewriting differences of
                  block entropies as averages over decay rates, and
                  ignoring all parts of the sample where these rates
                  are uncomputable because of the lack of neighbours,
                  yields improved entropy estimates. In the same way,
                  the scaling range for estimates of the information
                  dimension can be extended considerably. The
                  improvement is demonstrated for experimental data.},
  url =		 {dynamical_systems/kantz_schurmann_96.pdf},
  comments =	 {The paper presents a rather ad-hoc method for
                  dealing with undersampling problems in the entropy
                  rate estimation method suggested in
                  \cite{cohen-procaccia-85}. Since points with no
                  neighbors (= near-coincidences) contribute the same
                  to the entropies of n-grams and n-1-grams, they do
                  not contribute to the rate estimation. Thus it is
                  suggested that only points with more than some fixed
                  small number of $\epsilon$-neighbors are to be
                  included in entropy estimations. This leads to
                  underrepresentation of low-probability regions in
                  the entropy, but, according to the numerics done by
                  the authors, the error is small and statistical, but
                  not systematic. Similar estimation may be used to
                  find the information dimensions.},
}

@article{rateitschak-etal-95,
  title =	 {Dynamic entropy and long range correlations in
                  nonlinear processes},
  author =	 {K. Rateitschak and J. Freund and W. Ebeling},
  year =	 {1996},
  journal =	 {Int. J. Bifurcat. and Chaos},
  booktitle =	 {Entropy and entropy generation},
  editor =	 {J. Shiner},
  publisher =	 {Kluwer},
  address =	 {Dordrecht},
  comments =	 {A derivation of the subextensive entropic correction
                  for the logistic map should be in this article
                  according to \cite{ebeling-97}.},
  entered_on =	 {08/15/2001},
}

@inproceedings{hutter-02a,
  title =	 {Distribution of Mutual Information},
  author =	 {M. Hutter},
  year =	 {2002},
  booktitle =	 {Advances in Neural Information Processing Systems
                  14},
  editor =	 {T. G. Dietterich and S. Becker and Z. Ghahramani},
  publisher =	 {MIT Press},
  address =	 {Cambridge, MA},
  abstract =	 {The mutual information of two random variables i and
                  j with joint probabilities tij is commonly used in
                  learning Bayesian nets as well as in many other
                  fields. The chances tij are usually estimated by the
                  empirical sampling frequency nij/n leading to a
                  point estimate I(nij/n) for the mutual
                  information. To answer questions like "is I(nij/n)
                  consistent with zero?" or "what is the probability
                  that the true mutual information is much larger than
                  the point estimate?" one has to go beyond the point
                  estimate. In the Bayesian framework one can answer
                  these questions by utilizing a (second order) prior
                  distribution p(t) comprising prior information about
                  t. From the prior p(t) one can compute the posterior
                  p(t|n), from which the distribution p(I|n) of the
                  mutual information can be calculated. We derive
                  reliable and quickly computable approximations for
                  p(I|n). We concentrate on the mean, variance,
                  skewness, and kurtosis, and non-informative
                  priors. For the mean we also give an exact
                  expression. Numerical issues and the range of
                  validity are discussed. },
  comments =	 {Estimation of MI using Dirichlet priors; Wolpert and
                  Wolf have a better discussion.},
  postscript =	 {entropy/hutter-02.ps},
  entered_on =	 {08/01/05},
}
@article{kennel-etal-05,
  author =	 {M Kennel and J Shlens and H Abarbanel and E
                  Chichilnisky},
  title =	 {Estimating Entropy Rates with Bayesian Confidence
                  Intervals},
  journal =	 {Neural Comp.},
  volume =	 {17},
  number =	 7,
  pages =	 {1531--1576},
  year =	 2005,
  abstract =	 {The entropy rate quantifies the amount of
                  uncertainty or disorder produced by any dynamical
                  system. In a spiking neuron, this uncertainty
                  translates into the amount of information
                  potentially encoded and thus the subject of intense
                  theoretical and experimental
                  investigation. Estimating this quantity in observed,
                  experimental data is difficult and requires a
                  judicious selection of probabilistic models,
                  balancing between two opposing biases. We use a
                  model weighting principle originally developed for
                  lossless data compression, following the minimum
                  description length principle. This weighting yields
                  a direct estimator of the entropy rate, which,
                  compared to existing methods, exhibits significantly
                  less bias and converges faster in simulation. With
                  Monte Carlo techniques, we estimate a Bayesian
                  confidence interval for the entropy rate. In related
                  work, we apply these ideas to estimate the
                  information rates between sensory stimuli and neural
                  responses in experimental data (Shlens, Kennel,
                  Abarbanel, \& Chichilnisky, in preparation).},
  pdf =		 {entropy/kennel-etal-05.pdf},
}

@article{paninski-03,
  title =	 {Estimation of Entropy and Mutual Information},
  author =	 {Liam Paninski},
  abstract =	 {We present some new results on the nonparametric
                  estimation of entropy and mutual information. First,
                  we use an exact local expansion of the entropy
                  function to prove almost sure consistency and
                  central limit theorems for three of the most
                  commonly used discretized information
                  estimators. The setup is related to Grenander's
                  method of sieves and places no assumptions on the
                  underlying probability measure generating the
                  data. Second, we prove a converse to these
                  consistency theorems, demonstrating that a
                  misapplication of the most common estimation
                  techniques leads to an arbitrarily poor estimate of
                  the true information, even given unlimited
                  data. This inconsistency theorem leads to an
                  analytical approximation of the bias, valid in
                  surprisingly small sample regimes and more accurate
                  than the usual $1/N$ formula of Miller and Madow
                  over a large region of parameter space. The two most
                  practical implications of these results are
                  negative: (1) information estimates in a certain
                  data regime are likely contaminated by bias, even if
                  bias-corrected estimators are used, and (2)
                  confidence intervals calculated by standard
                  techniques drastically underestimate the error of
                  the most common estimation methods. Finally, we note
                  a very useful connection between the bias of entropy
                  estimators and a certain polynomial approximation
                  problem. By casting bias calculation problems in
                  this approximation theory framework, we obtain the
                  best possible generalization of known asymptotic
                  bias results. More interesting, this framework leads
                  to an estimator with some nice properties: the
                  estimator comes equipped with rigorous bounds on the
                  maximum error over all possible underlying
                  probability distributions, and this maximum error
                  turns out to be surprisingly small. We demonstrate
                  the application of this new estimator on both real
                  and simulated data.},
  journal =	 {Neural Comp.},
  volume =	 15,
  pages =	 {1191--1253},
  year =	 2003,
  pdf =		 {entropy/paninski-03.pdf},
}
@article{ma-81,
  title =	 {Calculation of entropy from data of motion},
  author =	 {S.~Ma},
  year =	 {1981},
  journal =	 {J.\ Stat.\ Phys.},
  volume =	 {26},
  pages =	 {221--240},
  comments =	 {Ma entropy bound, birthday problem, etc.}
}

@article{kraskov-etal-03,
  journal =	 {Phys.\ Rev.\ E},
  volume =	 69,
  pages =	 {066138},
  year =	 2004,
  howpublished = {arXiv: cond-mat/0305641},
  title =	 {Estimating Mutual Information},
  author =	 {Alexander Kraskov and Harald Stoegbauer and Peter
                  Grassberger},
  abstract =	 {We present two classes of improved estimators for
                  mutual information $M(X,Y)$, from samples of random
                  points distributed according to some joint
                  probability density $\mu(x,y)$. In contrast to
                  conventional estimators based on binnings, they are
                  based on entropy estimates from $k$-nearest
                  neighbour distances. This means that they are data
                  efficient (with $k=1$ we resolve structures down to
                  the smallest possible scales), adaptive (the
                  resolution is higher where data are more numerous),
                  and have minimal bias. Indeed, the bias of the
                  underlying entropy estimates is mainly due to
                  non-uniformity of the density at the smallest
                  resolved scale, giving typically systematic errors
                  which scale as functions of $k/N$ for $N$
                  points. Numerically, we find that both families
                  become {\it exact} for independent distributions,
                  i.e. the estimator $\hat M(X,Y)$ vanishes (up to
                  statistical fluctuations) if $\mu(x,y) = \mu(x)
                  \mu(y)$. This holds for all tested marginal
                  distributions and for all dimensions of $x$ and
                  $y$. In addition, we give estimators for
                  redundancies between more than 2 random
                  variables. We compare our algorithms in detail with
                  existing algorithms. Finally, we demonstrate the
                  usefulness of our estimators for assessing the
                  actual independence of components obtained from
                  independent component analysis (ICA), for improving
                  ICA, and for estimating the reliability of blind
                  source separation.},
  pdf =		 {entropy/kraskov-stogbauer-grassberger-03.pdf},
  entered_on =	 {05/12/04},
  comments =	 {Good collection of entropy/MI estimation references
                  for continuous distributions. Correct behavior of
                  the estimators for almost independent variables
                  suggests that the estimators have biases which
                  cancel if S(1)+S(2)=S(12). Figure 2 shows huge
                  biases. What should k be? Eq. 18 assumes uniformity
                  over epsilon-ball, which means some bias in the
                  mutual information (upwards bias in entropy, unknown
                  bias for MI). Good comment on cancelling biases
                  after eq 21. Biases! Maybe one can use Strong-like
                  correction (scalings on Fig 1 seem to say that error
                  is >> 2^I/N)? What should be a good starting
                  reparameterization of the data? etc. many of the
                  questions are just glanced over. Numerical examples
                  are all Gaussians, which is not ideal. What are the
                  errors of the estimators?},
}

@article{fienberg-etal-99,
  author =	 {S Fienberg and M Johnson and B Junker},
  year =	 1999,
  journal =	 {J.\ Roy.\ Stat.\ Soc.\ (Ser.\ A)},
  volume =	 162,
  number =	 3,
  title =	 {Classical multilevel and {Bayesian} approaches to
                  population size estimation using multiple lists},
  pages =	 {383--405},
  entered_on =	 {04/23/04},
  pdf =		 {entropy/fienberg-etal-99.pdf},
  abstract =	 {One of the major objections to the standard
                  multiple-recapture approach to population estimation
                  is the assumption of homogeneity of individual
                  `capture' probabilities. Modelling individual
                  capture heterogeneity is complicated by the fact
                  that it shows up as a restricted form of interaction
                  among lists in the contingency table
                  cross-classifying list memberships for all
                  individuals. Traditional log-linear modelling
                  approaches to capture-recapture problems are well
                  suited to modelling interactions among lists but
                  ignore the special dependence structure that
                  individual heterogeneity induces. A random-effects
                  approach, based on the Rasch model from educational
                  testing and introduced in this context by Darroch
                  and co-workers and Agresti, provides one way to
                  introduce the dependence resulting from
                  heterogeneity into the log-linear model; however,
                  previous efforts to combine the Rasch-like
                  heterogeneity terms additively with the usual
                  log-linear interaction terms suggest that a more
                  flexible approach is required. In this paper we
                  consider both classical multilevel approaches and
                  full Bayesian hierarchical approaches to modelling
                  individual heterogeneity and list interactions. Our
                  framework encompasses both the traditional
                  log-linear approach and various elements from the
                  full Rasch model. We compare these approaches on two
                  examples, the first arising from an epidemiological
                  study of a population of diabetics in Italy, and the
                  second a study intended to assess the `size' of the
                  World Wide Web. We also explore extensions allowing
                  for interactions between the Rasch and log-linear
                  portions of the models in both the classical and the
                  Bayesian context.},
  comments =	 {Good paper. Very good collection of references. The
                  problem addressed here is the following: suppose we
                  generate lists (samples) from a discrete set. Each
                  element is allowed to happen only once. Many such
                  lists are generated. Some outcomes occur in more
                  than one list, some do not occur at all. The
                  question is: what is the total number of possible
                  outcomes? This is the problem related to estimating
                  the entropy of the underlying distribution by the Ma
                  method, or by \cite{nsb}. It gets more difficult if
                  probabilities of being in a list depend on the list
                  (that is, the distribution over the set changes from
                  list to list). If the probabilities to appear on the
                  list are uniform and do not change from list to
                  list, then Ma solution estimates the population
                  size. If the probabilities are not uniform, but
                  still do not change, NSB may estimate the entropy
                  (just lump all data together), but estimating the
                  size is difficult. For changing probabilities, we
                  have very little to say. Comment on beginning of pg
                  387: of course, if we do not account for dependence
                  between the lists, we underestimate their entropy
                  (and thus the number). Paper talks about multiway
                  interactions in the context of the log-linear binary
                  models. The regime they work on is well sampled
                  probability distributions, probably not very
                  relevant to the regime in which we need to estimate
                  entropies.},
}

@Unpublished{paninski-04,
  author =	 {Liam Paninski},
  title =	 {Estimating entropy on m bins given fewer than m
                  samples},
  note =	 {submitted to IEEE Trans.\ Inf.\ Thy.},
  url =		 {http://www.cns.nyu.edu/~liam/info_est.html},
  pdf =		 {entropy/paninski-04.pdf},
  year =	 {2004},
  entered_on =	 {02/15/04},
  abstract =	 {Consider a sequence $p_N$ of discrete probability
                  measures, supported on $m_N$ points, and assume that
                  we observe $N$ i.i.d. samples from each $p_N$. We
                  demonstrate the existence of an estimator of the
                  entropy, $H(p_N)$, which is consistent even if the
                  ratio $N/m_N$ is bounded (and, as a corollary, even
                  if this ratio tends to zero, albeit at a
                  sufficiently slow rate).},
  comments =	 {A development of \cite{paninski-03}.},
}

@article{kennel-mees-02,
  title =	 {Context-tree modeling of observed symbolic dynamics},
  author =	 {Matthew B. Kennel and Alistair I. Mees},
  year =	 2002,
  journal =	 {Phys.\ Rev.\ E},
  volume =	 66,
  pages =	 {056209},
  abstract =	 {Modern techniques invented for data compression
                  provide efficient automated algorithms for the
                  modeling of the observed symbolic dynamics. We
                  demonstrate the relationship between coding and
                  modeling, motivating the well-known minimum
                  description length (MDL) principle, and give
                  concrete demonstrations of the "context-tree
                  weighting" and "context-tree maximizing"
                  algorithms. The predictive modeling technique
                  obviates many of the technical difficulties
                  traditionally associated with the correct MDL
                  analyses. These symbolic models, representing the
                  symbol generating process as a finite-state
                  automaton with probabilistic emission probabilities,
                  provide excellent and reliable entropy
                  estimations. The resimulations of estimated tree
                  models satisfying the MDL model-selection criterion
                  are faithful to the original in a number of
                  measures. The modeling suggests that the automated
                  context-tree model construction could replace
                  fixed-order word lengths in many traditional forms
                  of empirical symbolic analysis of the data. We
                  provide an explicit pseudocode for implementation of
                  the context-tree weighting and maximizing
                  algorithms, as well as for the conversion to an
                  equivalent Markov chain.},
  pdf =		 {entropy/kennel-mees-02.pdf},
  entered_on =	 {01/11/04},
  keywords =	 {SHLENS_NIPS03},
  comments =	 {Developments related to
                  \cite{willems-etal-95,willems-98}. Paper has a few
                  results on estimating entropies of various symbolic
                  sequences by the discussed CTW method (Fig 3 is
                  particularly interesting, as the estimator seems to
                  have no bias for quite low N, much better than LZ
                  type algorithms), but it looks to me that these
                  estimates also happen for the "asymptotic" regime of
                  $K/N\to 0$. Notice Fig 4 in this paper -- it looks
                  like these types of methods fail if there is a
                  divergent subextensive component to the entropy of a
                  symbolic sequence. When writing a review, I should
                  look up this paper for some nice comments related to
                  entropy in dynamical systems, particularly
                  generative models of dynamical systems. For
                  simulations, they use $\beta=1$. This is reasonable,
                  as explained in text, but are there any convergence
                  bounds in the spirit of \cite{willems-etal-95} for
                  such $\beta$? Probably not. For table 3:
                  distribution of each sequence matches between the
                  original model, and the CTW estimate. But does the
                  joint distribution for all of them also match?
                  Finally, the discussion at the end of the paper
                  discussed questions that are, in my view, not very
                  relevant for biological systems.},
}


@TechReport{batu-etal-02,
  author =	 {Tugkan Batu and Sanjoy Dasgupta and Ravi Kumar and
                  Ronitt Rubinfeld},
  title =	 {The complexity of approximating the entropy},
  institution =	 {NEC Research Institute},
  year =	 {2002},
  address =	 {Princeton, NJ},
  postscript =	 {entropy/batu-etal-02.ps.gz},
  entered_on =	 {10/29/03},
  abstract =	 {We consider the problem of approximating the entropy
                  of a discrete distribution under several models. If
                  the distribution is given explicitly as an array
                  where the i-th location is the probability of the
                  i-th element, then linear time is both necessary and
                  sufficient for approximating the entropy. We
                  consider a model in which the algorithm is given
                  access only to independent samples from the
                  distribution. Here, we show that a
                  $\gamma$-multiplicative approximation to the entropy
                  can be obtained in $O\left(n^{(1+\eta)/\gamma^2}
                  \mathrm{poly}(\log n)\right)$ time for distributions
                  with entropy $\Omega(\gamma/\eta)$, where $n$ is the
                  size of the domain of the distribution and $\eta$ is
                  an arbitrarily small positive constant. We show that
                  one cannot get a multiplicative approximation to the
                  entropy in general in this model. Even for the class
                  of distributions to which our upper bound applies,
                  we obtain a lower bound of
                  $\Omega\left(n^{\max(1/(2\gamma^2),2/(5\gamma^2-2))}
                  \right)$. We next consider a hybrid model in which
                  both the explicit distribution as well as
                  independent samples are available. Here,
                  significantly more efficient algorithms can be
                  achieved: a $\gamma$-multiplicative approximation to
                  the entropy can be obtained in
                  $O\left(\frac{\gamma^2 \log^2{n}}{h^2 (\gamma-1)^2}
                  \right)$ time for distributions with entropy
                  $\Omega(h)$; we show a lower bound of
                  $\Omega\left(\frac{\log n}{h(\gamma^2-1)}
                  \right)$. Finally, we consider two special families
                  of distributions: those for which the probability of
                  an element decreases monotonically in the label of
                  the element, and those that are uniform over a
                  subset of the domain. In each case, we give more
                  efficient algorithms for approximating the entropy.},
  comments =	 {The way approximation of entropies is done in this
                  paper is by partitioning "bins" into the ones with
                  low and high probability mass, and then estimating
                  entropy of each part independently. Estimating
                  entropy of bins with mass larger than 1/K^a requires
                  K^a samples, and the smaller bins have their entropy
                  bounded by 1/K^a log(K). This requires that the
                  entropy is, at least, greater than some constant
                  value, otherwise two distributions with small, but
                  different, entropies are difficult to
                  distinguish. This assumption is not a big problem,
                  however, since estimating small entropies is easy
                  (not up to a multiplicative, but up to an additive
                  factor). The paper proves that, in order to
                  guarantee approximation with the multiplicative
                  factor of 1, one needs at least K*(log(K))^(some
                  power) samples (in terminology of \cite{nsb}), that
                  is, good sampling is needed. This result is proven
                  by showing that one can create distributions with
                  same large elements and different small elements,
                  such that the total entropy is different, while any
                  reasonable observed statistics is the same. The
                  paper says nothing about additive estimators, which
                  might be more interesting.},
}

@Misc{wyner-foster-03,
  author =	 {Abraham J. Wyner and Dean Foster},
  title =	 {On the lower limits of entropy estimation},
  howpublished = {Preprint},
  month =	 jun,
  year =	 {2003},
  entered_on =	 {10/14/03},
  pdf =		 {entropy/wyner-foster-03.pdf},
  abstract =	 {In a recent paper, Antos and Kontoyiannis [1]
                  considered the problem of estimating the entropy of
                  a countably infinite discrete distribution from
                  independent identically distributed
                  observations. They left several open problems
                  regarding the convergence rates of entropy
                  estimates, which we consider here. Our first result
                  is that the plug-in estimate of entropy is as
                  efficient as a match length estimator when applied
                  to the class of memoryless sources on a countable
                  alphabet with finite entropy and finite entropy
                  "variance". Our second result provides lower bounds
                  on the convergence rate of any sequence of universal
                  estimators of entropy over the same class of
                  distributions. Finally, we consider an estimator
                  based on match lengths that achieves this lower
                  bound to first order. The surprising conclusion that
                  follows is that a match-length estimator is first
                  order optimal over the simplest class of
                  distributions for which entropy estimation is
                  non-trivial. We describe how this in turn implies
                  that the Lempel-Ziv algorithm has an optimal
                  convergence rate among the class of universal data
                  compression algorithms over the arguably simplest
                  class of non-trivial sources.},
  comments =	 {The paper builds on \cite{antos-kontoyiannis-02} and
                  analyzes convergence rates for plug-in and
                  matched-strings estimators of entropies on infinite
                  countable alphabets for finite memory and memoryless
                  processes. Useful asymptotics are
                  achieved. Further, a bunch of "no-go" theorems are
                  proven showing that it is impossible to estimate
                  entropies in a "universally consistent" way with the
                  convergence rates independent of the underlying
                  (unknown) probability distribution even for strong
                  assumptions (like, for example, finite entropy and
                  its variance). Even more, it's proven that for each
                  universally consistent estimator there always exists
                  a bad distribution, on which the estimator converges
                  to the right entropy extremely slowly (only roughly
                  as $1/\log(N)$). Proofs are done by explicit
                  constructions.},
}

@InCollection{jimenezmontano-etal-02,
  author =	 {Miguel Angel Jimenez-Montano and Werner Ebeling and
                  Thorsten Poeschel},
  title =	 {{SYNTAX}: A computer program to compress a sequence
                  and to estimate its information content},
  editor =	 {R. Lopez-Pena and H. Waelbroeck and F. Zertuche},
  booktitle =	 {Guanajuato Lectures, Mexico, 16--22 January 1995},
  publisher =	 {Springer},
  address =	 {New York},
  year =	 1995,
  url =		 {http://xxx.lanl.gov/abs/cond-mat/0204134},
  pdf =		 {entropy/jimenez-montano-etal-02.pdf},
  entered_on =	 {10/08/03},
  abstract =	 {The determination of block-entropies is a well
                  established method for the investigation of discrete
                  data, also called symbols (7). There is a large
                  variety of such symbolic sequences, ranging from
                  texts written in natural languages, computer
                  programs, neural spike trains, and biosequences. In
                  this paper a new algorithm to construct a short
                  context-free grammar (also called program or
                  description) that generates a given sequence is
                  introduced. It follows the general lines of a former
                  algorithm, employed to compress biosequences (1,2)
                  and to estimate the complexity of neural spike
                  trains (4), which uses as valuation function the, so
                  called, grammar complexity (2). The new algorithm
                  employs the (observed) block-entropies instead. A
                  variant, which employs a corrected "observed
                  entropy", as discussed in (7) is also described. To
                  illustrate its usefulness, applications of the
                  program to the syntactic analysis of a sample
                  biological sequences (DNA and RNA) is presented.},
  comments =	 {The paper introduces the pair replacement algorithm
                  (replace the most frequent pair of symbols with a
                  new symbol) and studies it. The termination point
                  for any new replacements is the point when the {\em
                  total} description length of the string, including
                  the preamble, starts to grow. This is in spirit of
                  MDL. Unfortunately, according to
                  \cite{grassberger-02}, the actual algorithm proposed
                  is not optimal and not uniquely decodable
                  (Grassberger corrects this), but it still can be
                  used for entropy estimation.},
}



@Misc{grassberger-02,
  author =	 {Peter Grassberger},
  title =	 {Data Compression and Entropy Estimates by
                  Non-sequential Recursive Pair Substitution},
  howpublished = {E-print physics/0207023},
  month =	 jul,
  year =	 {2002},
  url =		 {http://xxx.lanl.gov/abs/physics/0207023},
  pdf =		 {entropy/grassberger_02.pdf},
  entered_on =	 {10/08/03},
  abstract =	 {We argue that Non-sequential Recursive Pair
                  Substitution (NSRPS) as suggested by
                  Jim\'enez-Monta\~no and Ebeling can indeed be used
                  as a basis for an optimal data compression
                  algorithm. In particular, we prove for Markov
                  sequences that NSRPS together with suitable codings
                  of the substitutions and of the substitute series
                  does not lead to a code length increase, in the
                  limit of infinite sequence length. When applied to
                  written English, NSRPS gives entropy estimates which
                  are very close to those obtained by other
                  methods. Using ca. 135 GB of input data from the
                  project Gutenberg, we estimate the effective entropy
                  to be $\approx 1.82$ bit/character. Extrapolating to
                  infinitely long input, the true value of the entropy
                  is estimated as $\approx 0.8$ bit/character.},
  comments =	 {The paper studies the compression algorithm
                  introduced in \cite{jimenezmontano-etal-02} --
                  replace the most frequent pair of symbols with a new
                  symbol. It corrects the algorithms so that the
                  coding scheme now becomes optimal and uniquely
                  decodable. The paper proves that if a sequence is
                  generated by a depth 1 Markov process, then the
                  substituted sequence is also depth 1 Markovian. Thus
                  block entropies of any length are completely
                  determined by pair entropies. It is suggested that
                  such substitution scheme may actually drive the
                  sequence to Markovianity as a fixed point for many
                  successive substitutions. No proof of this is given,
                  though some empirical results are shown. The results
                  are curious: rank ordered plots of occurrences of
                  new compound symbols seem to tend to the same
                  scaling limit for many sequences as diverse as
                  random ones or English text. I am puzzled regarding
                  some results for the written English entropy
                  estimation. Some particular scaling law for
                  per-symbol entropy as a function of the number of
                  substitutions is proposed, but I have no guesses as
                  to why it happens. It's probably an artifact of the
                  large subextensive component in the language, but it
                  as well can be some artifact of the undersampling
                  (though the author claims that he has enough
                  data). With the amount of detail given in the paper,
                  it is impossible to check. Grassberger extrapolates
                  per symbol entropy to be $0.7 \pm 0.2$ bits. A much
                  lower value than what everyone else has.},
}

@Proceedings{NIPS-96,
  title =	 {Advances in Neural Information Processing Systems},
  booktitle =	 {Advances in Neural Information Processing Systems},
  editor =	 {Touretzky, David S. and Mozer, Michael C. and
                  Hasselmo, Michael E.},
  volume =	 {8},
  year =	 {1996},
  publisher =	 {MIT Press},
  address =	 {Cambridge, MA},
  venue =	 {Denver, CO},
}

@article{ahmad-lin-76,
  author =	 {I. Ahmad and P. Lin},
  title =	 {A Nonparametric Estimation of the Entropy for
                  Absolutely Continuous Distributions},
  journal =	 {IEEE Transactions on Information Theory},
  volume =	 {22},
  year =	 {1976},
  pages =	 {372--375},
  entered_on =	 {08/21/2001},
  keywords =	 {TO_GET},
  comments =	 {The circumstantial evidence shows that this paper
                  deals with estimating entropies of smooth continuous
                  distributions by means of kernel density estimation
                  with Laplace (exponential) kernel. The width of the
                  kernel should fall off as a power law in the number
                  of samples. The entropy estimate is then the
                  empirical average over the data points of the
                  logarithm of the estimated distribution. See also
                  \cite{eggermont-lariccia-99}, \cite{levit-78},
                  \cite{tsybakov-vandermeulen-94}.}
}


@article{anishchenko-ebeling-neiman-94,
  author =	 {V. S. Anishchenko and W. Ebeling and A. B. Neiman},
  title =	 {Power law distributions of spectral density and
                  higher order entropy},
  journal =	 {Chaos, Solitons and Fractals},
  volume =	 4,
  pages =	 {69--81},
  year =	 1994,
  keywords =	 {TO_GET},
}

@inproceedings{antos-kontoyiannis-01,
  author =	 {Antos, A. and Kontoyiannis, I.},
  title =	 {Estimating the entropy of discrete distributions},
  booktitle =	 {IEEE International Symposium on Information Theory},
  year =	 2001,
  url =		 {entropy/antos_kontoyiannis_01.pdf},
  entered_on =	 {08/11/2001},
  comments =	 {This is a preliminary version of
                  \cite{antos-kontoyiannis-02}.},
}

@Article{antos-kontoyiannis-02,
  author =	 {Andras Antos and Ioannis Kontoyiannis},
  title =	 {Convergence Properties of Functional Estimates for
                  Discrete Distributions},
  abstract =	 {Suppose P is an arbitrary discrete distribution on a
                  countable alphabet X . Given an i.i.d. sample (X1
                  ... Xn) drawn from P, we consider the problem of
                  estimating the entropy H(P) or some other functional
                  F = F(P) of the unknown distribution P. We show
                  that, for additive functionals satisfying mild
                  conditions (including the cases of the mean, the
                  entropy, and mutual information), the plug-in
                  estimates of F are universally consistent. We also
                  prove that, without further assumptions, no
                  rate-of-convergence results can be obtained for any
                  sequence of estimates. In the case of entropy
                  estimation, under a variety of different
                  assumptions, we get rate-of-convergence results for
                  the plug-in estimate and for a nonparametric
                  estimate based on match-lengths. The behavior of the
                  variance and the expected error of the plug-in
                  estimate is shown to be in sharp contrast to the
                  finite-alphabet case. A number of other important
                  examples of functionals are also treated in some
                  detail.},
  keywords =	 {Functional estimation; entropy estimation; rates of
                  convergence; match lengths.},
  journal =	 {Random Structures \& Algorithms},
  year =	 2001,
  volume =	 19,
  pages =	 {163--193},
  entered_on =	 {08/11/2001},
  url =		 {entropy/antos_kontoyiannis_02.pdf},
  comments =	 {For countable (but infinite) alphabets considered,
                  the plug-in and the matched string estimates of
                  entropy are still consistent, but now the rate of
                  convergence to the actual value cannot be
                  estimated. Even for some very strong conditions, the
                  rate is only logarithmic in the number of
                  observations (for matched strings estimator), or a
                  power law with a small power (for the plug in
                  estimator), instead of the usual inverse square root
                  for the finite alphabets. This means that we have to
                  study the method of \cite{nsb} again and check the
                  dependence of the entropy estimate on the (possibly
                  excessive) number of bins.}
}

@article{barron-gyorfi-vandermeulen-92,
  author =	 {A. R. Barron and L. Gy{\"o}rfi and E. C. van der
                  Meulen},
  title =	 {Distribution estimation consistent in total
                  variation and in two types of information
                  divergence},
  journal =	 {IEEE Transactions on Information Theory},
  volume =	 38,
  pages =	 {1437--1454},
  month =	 sep,
  year =	 1992,
  entered_on =	 {08/20/2001},
  comments =	 {TO_GET. Should be about continuous densities.}
}

@article{beirlant-etal-97,
  author =	 {J. Beirlant and E. Dudewicz and L. Gy{\"o}rfi and
                  E. van der Meulen},
  title =	 {Nonparametric entropy estimation: An overview},
  journal =	 {Int. J. Math. Stat. Sci.},
  volume =	 {6},
  number =	 {1},
  pages =	 {17--39},
  year =	 {1997},
  entered_on =	 {07/18/2001},
  pdf =		 {entropy/beirlant_etal_97.pdf},
  abstract =	 {An overview is given of the several methods in use
                  for the nonparametric estimation of the differential
                  entropy of a continuous random variable. The
                  properties of various methods are compared. Several
                  applications are given such as tests for
                  goodness-of-fit, parameter estimation, quantization
                  theory and spectral estimation.},
  comments =	 {The paper provides an up-to-date review of methods
                  (or lack of them) for estimating entropies of
                  continuous distributions (the emphasis is made on
                  1-d distributions). The methods cluster in two
                  parts: first, there are resampling-like
                  methods. Second, there are methods that estimate
                  entropy based on quasi-estimation of the
                  distributions (through distance between the nearby
                  samples). In the first case, the error of the
                  estimate is questionable; in the second case we have
                  to remember that the average Entropy is not the same
                  as the entropy of the average distribution. Finally,
                  the paper gives some examples of where the reviewed
                  methods can be used.}
}

@article{bercher-vignat-00,
  title =	 {Estimating the entropy of a signal with
                  applications},
  author =	 {J.-F. Bercher and C. Vignat},
  journal =	 {IEEE Transactions on Signal Processing},
  pages =	 {1687--1694},
  month =	 jun,
  year =	 2000,
  volume =	 48,
  number =	 6,
  abstract =	 {We present a new estimator of the entropy of
                  continuous signals. We model the unknown probability
                  density of data in the form of an AR spectrum
                  density and use regularized long-AR models to
                  identify the AR parameters. We then derive both an
                  analytical expression and a practical procedure for
                  estimating the entropy from sample data. We indicate
                  how to incorporate recursive and adaptive features
                  in the procedure. We evaluate and compare the new
                  estimator with other estimators based on histograms,
                  kernel density models, and order
                  statistics. Finally, we give several examples of
                  applications. An adaptive version of our entropy
                  estimator is applied to detection of law changes,
                  blind deconvolution, and source separation.},
  keywords =	 {adaptive estimation; signal entropy estimation;
                  continuous signals; probability density; AR spectrum
                  density; regularized long-AR models; AR parameters
                  identification; sample data; recursive estimator;
                  histograms; kernel density models; order statistics;
                  adaptive entropy estimator; signal detection; blind
                  deconvolution; source separation; blind
                  equalization; signal processing},
  entered_on =	 {08/20/2001},
  url =		 {entropy/bercher_vignat_00.pdf},
  comments =	 {The paper uses the method of
                  \cite{kitagawa-gersh-85} and
                  \cite{giovanelli-demoment-herment-96} to estimate
                  entropies of continuous probability
                  distributions. These methods are based on
                  approximating a pdf by a spectrum of some
                  autoregressive process with ever increasing number
                  of Fourier modes. The amplitudes of the modes are
                  obtained by regularized least squares method, which
                  is equivalent to a saddle point solution to some
                  Bayesian theory with smoothness constraints. The
                  authors present a formula, derived in the above
                  cited literature, that estimates the hyper-parameter
                  of the Bayesian model, which is an analog of the
                  smoothness scale (not smoothness exponent) selection
                  in terms of \cite{nemenman-bialek-01}. Later the
                  authors estimate the entropy of the pdf as the
                  entropy of the estimator, which we know to be not
                  the best strategy. Then they derive an approximation
                  to this estimate that does not involve finding the
                  pdf as an intermediate step and may thus turn out to
                  be better than the value it approximates (this
                  deserves further thinking). Finally, authors use
                  this approximation for numerical tests. It looks
                  like, even though the smoothness parameters and the
                  order of the model may be determined
                  self-consistently, the authors either do not do
                  this, or do not explain that they do this.},
}

@inproceedings{bercher-vignat-99,
  title =	 {Estimating the entropy of a signal with
                  applications},
  author =	 {J.-F. Bercher and C. Vignat},
  booktitle =	 {Proceedings of the 1999 {IEEE} International
                  Conference on Acoustics, Speech, and Signal
                  Processing},
  pages =	 {1705--1708},
  volume =	 3,
  month =	 mar,
  year =	 1999,
  eventdate =	 {1999-03-15/1999-03-19},
  abstract =	 {We present an estimator of the entropy of a
                  signal. The basic idea is to adopt a model of the
                  probability law, in the form of an AR
                  spectrum. Then, the law parameters can be estimated
                  from the data. We examine the statistical behavior
                  of our estimates of laws and entropy. Finally, we
                  give several examples of applications: an adaptive
                  version of our entropy estimator is applied to
                  detection of law changes, blind deconvolution and
                  sources separation.},
  keywords =	 {entropy; signal; probability law; AR spectrum; law
                  parameter; statistical behavior; adaptive entropy
                  estimator; changes detection; blind deconvolution;
                  sources separation},
  entered_on =	 {08/20/2001},
  comments =	 {Conference proceedings version of
                  \cite{bercher-vignat-00}.},
  url =		 {entropy/bercher_vignat_99.pdf},
}

@article{borwein-lewis-94,
  author =	 {P. Borwein and A. Lewis},
  title =	 {Moment-matching and best entropy estimation},
  journal =	 {Journal of Mathematical Analysis and Applications},
  volume =	 185,
  pages =	 {596--604},
  year =	 1994,
  comments =	 {TO_GET},
}

@article{correa-95,
  author =	 {J. C. Correa},
  title =	 {A new estimator of entropy},
  journal =	 {Communications in Statistics -- Theory and Methods},
  volume =	 24,
  pages =	 {2439--2449},
  year =	 1995,
  comments =	 {TO_GET. Should be about continuous densities.},
}

@article{darbellay-vajda-99,
  author =	 {Georges A. Darbellay and Igor Vajda},
  title =	 {Estimation of the Information by an Adaptive
                  Partitioning of the Observation Space},
  journal =	 {IEEE Transactions on Information Theory},
  volume =	 45,
  number =	 4,
  pages =	 {1315--1321},
  month =	 may,
  year =	 {1999},
  abstract =	 {We demonstrate that it is possible to approximate
                  the mutual information arbitrarily closely in
                  probability by calculating relative frequencies on
                  appropriate partitions and achieving conditional
                  independence on the rectangles of which the
                  partitions are made. Empirical results, including a
                  comparison with maximum-likelihood estimators, are
                  presented.},
  keywords =	 {Data-dependent partitions, maximum-likelihood
                  estimation, mutual information, nonparametric
                  estimation, discretization, binning},
  url =		 {http://citeseer.nj.nec.com/darbellay99estimation.html},
  entered_on =	 {08/19/2001},
  comments =	 {If the above url is unavailable, download the paper
                  <a
                  href="entropy/darbellay_vajda_99.pdf">here</a>. The
                  paper deals with nonparametric mutual information
                  estimation between two continuous one-dimensional
                  variables (it is possible to extend this to higher
                  dimensions). The method amounts to plug-in mutual
                  information estimation on some non-trivial
                  quantization of the continuous data space. In short,
                  one starts with one big bin and then cuts it into
                  pieces if (a) there is data in the bin, and (b) data
                  seems not to marginalize. It is not clear what
                  criterion the authors want to use to determine the
                  parameters of the algorithm (the way the bins get
                  cut, the threshold for marginalization, etc.). One
                  probably has to think a bit more about what prior is
                  implicitly assumed by the iteration stopping
                  condition; apparently, since in all the numerics the
                  authors show the mutual information is
                  underestimated, this implicit regularizing prior is
                  too smooth.},
}

@article{ebeling-93,
  author =	 {Ebeling, W.},
  title =	 {Entropy and information in processes of
                  self-organization: uncertainty and predictability},
  journal =	 {Physica A},
  volume =	 194,
  number =	 {1--4},
  pages =	 {563--575},
  month =	 may,
  year =	 1993,
  abstract =	 {The mean uncertainties of probability distributions
                  of discrete sets of events and those of dynamic
                  sequences of events are investigated on the
                  macroscopic and on the microscopic level. First
                  one-time distributions are studied. The key point is
                  the specification of the state space; the
                  (coarse-grained) physical phase space leads to the
                  thermodynamical entropy and the order parameter
                  space to the information entropy. In a similar way
                  the uncertainty of the state after one step ahead in
                  time, introduced by Shannon, McMillan and Khinchin,
                  is related to the dynamic entropies studied by
                  Kolmogorov and Sinai and to the gain of
                  information. The non-equilibrium entropies and their
                  relation to properties of attractors are discussed
                  for several examples taken from physical and
                  non-physical self-organization processes.},
  keywords =	 {TO_GET},
}

@article{ebeling-97,
  author =	 {W. Ebeling},
  title =	 {Prediction and entropy of nonlinear dynamical
                  systems and symbolic sequences with {LRO}},
  journal =	 {Physica D},
  volume =	 109,
  year =	 1997,
  pages =	 {42--52},
  entered_on =	 {08/15/2001},
  keywords =	 {shannon entropy, symbolic dynamics},
  url =		 {entropy/ebeling_97.pdf},
  abstract =	 {Following Shannon we introduce higher order
                  entropies and derive dynamic entropies. The n'th
                  order dynamic entropy (conditional entropy) is a
                  measure of the uncertainty of the next state which
                  follows after the observation of n foregoing
                  states. The asymptotic behaviour of the dynamic
                  entropies at large n is studied for several
                  nonlinear model systems and for symbolic sequences
                  with long-range order (LRO). For example we
                  investigate 1D-maps, texts, DNA-strings and time
                  series. It is shown that the existence of long
                  correlations improves the possibility of
                  predictions. Characteristics scaling laws for the
                  higher order Shannon entropies and the conditional
                  entropies are derived and a new interpolation
                  formula is tested. Finally instead of the dynamic
                  entropies which yield mean values of the
                  uncertainty/predictability we investigate the local
                  values of the uncertainty/predictability.},
  comments =	 {See also \cite{ebeling-frommel-98}. The paper deals
                  with power law decays of the conditional entropies
                  of N-grams: S(N)-S(N-1). The cases of interest are
                  assumed to be those when this quantity
                  asymptotically falls off as a power law (with
                  exponent greater, equal, or less than 1). Fibonacci
                  sequence is shown to fall in the class of 1/N
                  falloff (here the authors miss the point that the
                  ensemble of Fibonacci sequences is just one
                  sequence, so it makes not much sense to talk about
                  its entropy). Similarly, logistic map at Feigenbaum
                  point is in the same class with 4/3N falloff (note
                  the exact prefactor; this result seems to be carried
                  over from \cite{ebeling-nicolis-91},
                  \cite{ebeling-nicolis-92}, \cite{ebeling-etal-96},
                  \cite{rateitschak-etal-95}). For actual numerics
                  (texts, DNA, weather data) the authors use the
                  method from \cite{poschel-etal-95} to estimate
                  entropies. They claim that the best fit for the
                  entropy behavior for most cases is constant + power
                  law falloff + exponential falloff. In some cases
                  they observe ``phase transitions'' -- sudden changes
                  in the constant term of the entropy. Dominating
                  square root term for text entropies is
                  stated. Source entropy for texts is suggested to be
                  0.64 bits/character. DNA with repeating sequences
                  removed is said to have no power law
                  correlations. The most interesting part of the paper
                  is its investigation of local predictability, as
                  measured by conditional entropy of the next letter
                  given the context. This allows one to discover real
                  words in texts.},
}

@incollection{ebeling-etal-01,
  author =	 {W. Ebeling and L. Molgedey and J. Kurths and
                  U. Schwarz},
  title =	 {Entropy, complexity, predictability and data
                  analysis of time series and letter sequences},
  editor =	 {A. Bunde and J. Kropp and H.-J. Schellnhuber},
  booktitle =	 {The Science of Disasters: Climate Disruptions, Heart
                  Attacks, and Market Crashes},
  publisher =	 {Springer},
  address =	 {Berlin},
  year =	 2001,
  postscript =	 {entropy/ebeling_etal_01.ps.gz},
  abstract =	 {The structure of time series and letter sequences is
                  investigated using the concepts of entropy and
                  complexity. First conditional entropy and
                  transinformation are introduced and several
                  generalizations are discussed. Further several
                  measures of complexity are introduced and
                  discussed. The capability of these concepts to
                  describe the structure of time series and letter
                  sequences generated by nonlinear maps, data series
                  from meteorology, astrophysics, cardiology,
                  cognitive psychology and finance is
                  investigated. The relation between the complexity
                  and the predictability of informational strings is
                  discussed. The relation between local order and the
                  predictability of time series is investigated. },
  comments =	 {This is a non-technical summary of work by Ebeling
                  et al. It introduces entropies, long range
                  correlations, discusses various definitions of
                  complexities of signals, and summarizes calculations
                  of these quantities for biological sequences, texts,
                  economical data, Solar flares data, heart-beat data,
                  etc.}
}

@Article{ebeling-frommel-98,
  author =	 {Werner Ebeling and Cornelius Fr{\"o}mmel},
  title =	 {Entropy and predictability of information carriers},
  journal =	 {BioSystems},
  year =	 {1998},
  volume =	 {46},
  pages =	 {47--55},
  entered_on =	 {08/16/2001},
  pdf =		 {entropy/ebeling_frommel_98.pdf},
  abstract =	 {The structure of linear strings carrying information
                  is investigated by means of entropy concepts. First
                  conditional entropy and transinformation are
                  introduced and several generalizations are
                  discussed. The capability to describe the structure
                  of information carriers as DNA, proteins, texts and
                  musical strings is investigated. The relation
                  between order and the predictability of
                  informational strings is discussed. As examples we
                  study the mutual information function of virus DNA
                  and several long proteins. Further we show some
                  (rather formal) analogies to the structure of texts,
                  and strings generated by musical melodies. It is
                  shown that several information carriers show
                  long-range correlations.},
  comments =	 {See also \cite{ebeling-97}, of which this paper is
                  mainly a repeat. The current paper adds looking into
                  predictability (local and global) of protein
                  sequences and musical notes, which is measured by
                  mutual information (they call it transinformation)
                  between symbols some distance apart.},
}

@Article{ebeling-nicolis-91,
  author =	 {W. Ebeling and G. Nicolis},
  title =	 {Entropy of Symbolic Sequences: The Role of
                  Correlations},
  journal =	 {Europhysics Letters},
  volume =	 14,
  number =	 3,
  pages =	 {191--196},
  year =	 {1991},
  entered_on =	 {08/15/2001},
  abstract =	 {The role of correlations in the a priori probability
                  of occurrence of symbolic sequences is analyzed, on
                  the basis of the scaling behaviors of the entropy
                  as a function of the sequence length. It is shown
                  that sporadic systems give rise to peculiar scaling
                  properties as a result of long-range
                  correlations. The potential implications of this
                  possibility in the structure of natural languages
                  are explored.},
  comments =	 {This seems to be the first paper in the Ebeling et
                  al. series on entropy of sequences. Among other
                  things, a derivation of the subextensive entropic
                  correction for the logistic map should be in this
                  article according to \cite{ebeling-97}. I have not
                  read this paper. This paper should be similar (and
                  should probably be quoted together with)
                  \cite{ebeling-nicolis-92}. TO_GET}
}

@article{eggermont-lariccia-99,
  author =	 {Paul P. B. Eggermont and Vincent N. LaRiccia},
  title =	 {Best Asymptotic Normality of the Kernel Density
                  Entropy Estimator for Smooth Densities},
  journal =	 {IEEE Transactions on Information Theory},
  volume =	 45,
  number =	 4,
  pages =	 {1321--1326},
  month =	 may,
  year =	 1999,
  entered_on =	 {08/21/2001},
  pdf =		 {entropy/eggermont_lariccia_99.pdf},
  abstract =	 {In the random sampling setting we estimate the
                  entropy of a probability density distribution by the
                  entropy of a kernel density estimator using the
                  double exponential kernel. Under mild smoothness and
                  moment conditions we show that the entropy of the
                  kernel density estimator equals a sum of independent
                  and identically distributed (i.i.d.) random
                  variables plus a perturbation which is
                  asymptotically negligible compared to the parametric
                  rate $n^{-1/2}$. An essential part in the proof is
                  obtained by exhibiting almost sure bounds for the
                  Kullback--Leibler divergence between the kernel
                  density estimator and its expected value. The basic
                  technical tools are Doob's submartingale inequality
                  and convexity (Jensen's inequality).},
  keywords =	 {Convexity, entropy estimation, kernel density
                  estimators, Kullback--Leibler divergence,
                  submartingales},
  comments =	 {The paper proposes to estimate entropies of
                  continuous pdfs by first estimating the pdf with
                  Laplace (exponential) kernels, and then calculating
                  the entropy of the estimate. Note that they do not
                  take the empirical average of the log(estimate), but
                  rather calculate the $P \log P$ integral for the
                  estimate. The paper further assumes that the moment
                  of order $k>2$ of the pdf is finite, and that the
                  integrals of (roughly) squares of first and second
                  derivatives of the log of the pdf are finite
                  too. The main conclusion of the paper is that then
                  the entropy estimator converges to the logarithm of
                  the actual pdf averaged over the sample points,
                  which is neither the empirical average, nor the
                  actual entropy, but something in between. It's
                  questionable whether this convergence means
                  convergence of the estimator to the entropy. For the
                  convergence to take place, the width of the kernel
                  functions should fall off with the number of data
                  points at least as fast as $n^{-1/4}$ (if $k=2$), or
                  even faster, if $k>2$. The rate of this convergence
                  starts with $n^{-1/2}$ for $k=2$ and becomes even
                  better for increasing $k$. This is equivalent to the
                  counting statistics and hints that under assumed
                  conditions entropy may be estimated as a ``single
                  number'' without estimating the pdf first, which
                  usually gives worse convergence. Note that in terms
                  of \cite{nemenman-bialek-01} smoothness conditions
                  in this paper correspond to $\eta_a=2$, thus the
                  effective bin size should fall off as $n^{-1/4}$;
                  this is in agreement with the current paper for
                  $k=2$. Better convergence for larger $k$ probably
                  means that distributions with more limited support
                  have a smaller number of effective degrees of
                  freedom. },
}

@InProceedings{farach-etal-95,
  author =	 {Martin Farach and Michiel Noordewier and Serap
                  Savari and Larry Shepp and Abraham Wyner and Jacob
                  Ziv},
  title =	 {On the entropy of {DNA}: Algorithms and measurements
                  based on memory and rapid convergence},
  booktitle =	 {Symposium on Discrete Algorithms (SODA)},
  year =	 1995,
  entered_on =	 {08/11/2001},
  url =		 {entropy/farach_etal_95.ps.gz},
  abstract =	 {We have applied the information-theoretic notion of
                  entropy to characterize DNA sequences. We consider a
                  genetic sequence signal that is too small for
                  asymptotic entropy estimates to be accurate, and for
                  which similar approaches have previously failed. We
                  prove that the match length entropy estimator has a
                  relatively fast convergence rate and demonstrate
                  experimentally that by using this entropy estimator,
                  we can indeed extract a meaningful signal from
                  segments of DNA. Further, we derive a method for
                  detecting certain signals within DNA -- known as
                  splice junctions -- with significantly better
                  performance than previous methods. The main result
                  of the paper is that we find that the entropy of the
                  genetic material which is ultimately expressed in
                  protein sequences is higher than that which is
                  discarded. This is an unexpected result, since
                  current biological theory holds that the discarded
                  sequences (introns) are capable of
                  tolerating random changes to a greater degree than
                  the retained sequences (exons).},
  comments =	 {According to this paper, the statistical error of
                  the matching estimator drops inversely proportional
                  to the square root of the number of data
                  points. However, the systematic error will fall off
                  only inversely proportionally to the logarithm of
                  the same. This is due to the fact that systematic
                  error is governed by the memory effects, and memory
                  is tested only up to the amount of observed
                  data. See also \cite{wyner-ziv-wyner-98}.},
}

@misc{gabrielli-etal-03,
  author =	 {D. Gabrielli and A. Galves and D. Guiol},
  title =	 {Fluctuations of the Empirical Entropies of a Chain
                  of Infinite Order},
  howpublished = {E-print},
  month =	 aug,
  year =	 {2003},
  url =		 {http://arxiv.org/abs/cond-mat/0308508},
  pdf =		 {entropy/gabrielli-03.pdf},
  entered_on =	 {08/27/2003},
  abstract =	 {This paper addresses the question of the
                  fluctuations of the empirical entropy of a chain of
                  infinite order. We assume that the chain takes
                  values on a finite alphabet and loses memory
                  exponentially fast. We consider two possible
                  definitions for the empirical entropy, both based on
                  the empirical distribution of cylinders with length
                  c log(n), where n is the size of the sample and c is
                  a suitable constant. The first one is the
                  conditional entropy of the empirical distribution,
                  given a past with length growing logarithmically
                  with the size of the sample. The second one is the
                  rescaled entropy of the empirical distribution of
                  the cylinders of size growing logarithmically with
                  the size of the sample. We prove a central limit
                  theorem for the first one. We also prove that the
                  second one does not have Gaussian fluctuations. This
                  solves a problem formulated in Iosifescu (1965).},
  comments =	 {The paper deals with properties of empirical
                  estimators of conditional entropies and
                  block-entropies. First, the paper limits itself to
                  the stochastic processes with exponentially decaying
                  memory. In the notation of \cite{bnt-01}, it means
                  that the subextensive entropy $S_1$ is bounded from
                  above for any block length. Other important
                  assumptions include: the measure for any sequence is
                  nonzero, and limiting per letter block entropy and
                  the limiting conditional entropy are the
                  same. Further, this is a purely asymptotic paper,
                  with the block length ($K$) manifestly dominated by
                  the sample size ($N$). The Gaussian behavior of the
                  empirical conditional entropy (Theorem 2.1), for
                  example, is proven to have standard deviation of
                  $1/\sqrt{N}$, as should be expected. This says
                  nothing about the famous Miller bias correction of
                  $\sim K/N$ for the mean of the said Gaussian
                  distribution (note also that the requirement of
                  nonzero measure for any sequence is very important
                  for this theorem, as the prefactor for $1/\sqrt{N}$
                  is the average delayed second order correlation
                  function of the log-probability, which may be
                  unbounded for zero probability). Theorem 2.2
                  basically says that the subextensive part of the
                  block entropy, which is bounded for the assumptions
                  used, will be empirically estimated as that same
                  constant, but the fluctuations are not
                  normal. Again, no bias corrections are analyzed. In
                  summary, this paper probably has very little
                  relation to practical questions about estimation of
                  entropy from small samples.},
}

@misc{grassberger-03,
  author =	 {Peter Grassberger},
  title =	 {Entropy Estimates from Insufficient Samplings},
  howpublished = {E-print},
  month =	 jul,
  year =	 2003,
  url =		 {http://www.arxiv.org/abs/physics/0307138},
  pdf =		 {entropy/grassberger_03.pdf},
  entered_on =	 {09/13/03},
  abstract =	 {We present a detailed derivation of some estimators
                  of Shannon entropy for discrete distributions. They
                  hold for finite samples of N points distributed into
                  M "boxes", with N and M -> oo, but N/M < oo. In the
                  high sampling regime (>> 1 points in each box) they
                  have exponentially small biases. In the low sampling
                  regime the errors increase but are still much
                  smaller than for most other estimators. One
                  advantage is that our main estimators are given
                  analytically, with explicitly known analytical
                  formulas for the biases.},
  comments =	 {The paper is a continuation of
                  \cite{grassberger-88}, derives (corrected) bias for
                  estimators introduced there, and introduces a couple
                  of yet better estimators. The idea behind the method
                  is that each bin should contribute to entropy
                  independent of all other bins. So for unbiased
                  entropy estimation one just needs to find an
                  unbiased estimator of $p\log p$ (or rather $z \log
                  z$, $z=Np$, where $N$ is the number of
                  samples). This has an obvious problem of inter-bin
                  dependence through normalization, but this is easy
                  to correct and, in any case, not very important for
                  $M$, the number of bins, being large. In my view,
                  two other problems may be more serious: (1) bins
                  with zero counts, which may contribute a lot to
                  entropy, are neglected, (2) one can use some
                  statistics (say, rank ordered plot form) of other
                  bins to understand which family the studied
                  distribution belongs to, and then contribution of a
                  given bin to the total entropy may be made dependent
                  on the family. Further the paper assumes that the
                  probability in each of the bins is <<1, but this is
                  just a technical assumption. The paper starts with a
                  simple function, eq 13, which is supposed to give an
                  unbiased (modulo above assumptions) estimate of
                  $\exp(Renyi entropy of order>=2)$ (is the
                  observation new?). I would disagree with the
                  "unbiased" claim: again, it is possible that a bin
                  will have $n_i=0$, which is thrown away in the
                  averaging. From there the author develops a series
                  of functions of $n_i$, which, if averaged over
                  $n_i>0$ give smaller and smaller bias in
                  approximating $z\log z$, at a cost of larger
                  variance (the trade-off is quite understandable!).
                  The mathematics is beautiful. All of the developed
                  estimators are guaranteed to have negative bias in
                  entropy, but smaller than that of Miller. Another
                  part of the negative bias comes from neglected
                  bins. Further, the value of bias is exponentially
                  small in $N/M$. These last two observations tell us
                  that the method is an <em>asymptotic</em> method,
                  which would be inapplicable if $N/M<1$, the case
                  discussed in \cite{nsb} and \cite{nsb2}. Grassberger
                  also suggests an estimator, which is derived by
                  explicitly requiring that expectation of the
                  estimator (again, over $n>0$) is close to $z\log z$
                  in some arbitrary metric and measure over $z$, eq
                  36. This is at odds with the otherwise non-Bayesian
                  approach because the solution to such minimization
                  problem strongly depends on the measure assumed on
                  the space of z, that is, on the prior. Further, as
                  Grassberger himself notices, estimators obtained
                  this way are non-monotonic in $n$, so that having
                  more samples in a bin may contribute less to
                  entropy. This may be (technically, but not
                  conceptually) nice for overcoming the problems of
                  not using the $n=0$ bins since those will be
                  accounted for at $n=1$ level (again, should be
                  sensitive to the measure). Grassberger compares
                  performance of his estimators to \cite{nsb} on the
                  same examples as we used in that paper. He is
                  sketchy about the comparison; in particular I don't
                  understand the comment that only in a few cases he
                  had some of the boxes being empty (and this is for
                  $M>1000$, $N<300$). But his conclusions seem
                  reasonable -- his estimators appear to work on par
                  with (and sometimes better than) \cite{nsb} if
                  there are not too many bins with zero counts and
                  nonzero probability mass, or with counts of ~1. In
                  summary, nice paper. I need to further investigate
                  some of Grassberger's points about a possibility of
                  building an unbiased entropy estimate at an expense
                  of a huge variance. I suspect that, even allowing
                  for an arbitrarily high variance, one will still get
                  zero bias only for some prior over allowed
                  distributions (say, limiting the number of bins).},
}

@article{grassberger-88,
  author =	 {Peter Grassberger},
  title =	 {Finite sample corrections to entropy and dimension
                  estimates},
  journal =	 {Phys.\ Lett.\ A},
  volume =	 128,
  pages =	 {369--373},
  year =	 1988,
  entered_on =	 {08/18/2001, 06/11/2003},
  abstract =	 {We derive the systematic corrections to estimates of
                  generalized (Renyi) entropies and to generalized
                  dimensions Dq from finite data sets. As an
                  application, we discuss correlation estimates of Dq
                  for the Henon map. We end with some remarks about
                  lacunarity measures.},
  comments =	 {Another way to calculate the downwards bias in the
                  entropy estimation for some particular class of
                  probability distributions is presented. See
                  \cite{schurmann-grassberger-96}. The attempt is to
                  use only observed quantities in the bias
                  calculations.},
}

@article{grassberger-89,
  author =	 {Peter Grassberger},
  title =	 {Estimating the information content of symbol
                  sequences and efficient codes},
  journal =	 {IEEE Trans. Inf. Thy.},
  volume =	 35,
  year =	 1989,
  pages =	 {669--675},
  entered_on =	 {08/16/2001},
  url =		 {entropy/grassberger_89.pdf},
  abstract =	 {Several variants of an algorithm for estimating
                  Shannon entropies of symbol sequences are
                  presented. They are all related to the Lempel-Ziv
                  algorithm and to recent algorithms for estimating
                  Hausdorff dimensions. The average storage and
                  running times increase as N and N log N,
                  respectively, with the sequence length N. These
                  algorithms proceed basically by constructing
                  efficient codes. They seem to be the optimal
                  algorithms for sequences with strong long-range
                  correlations, e.g., natural languages. An
                  application to written English illustrates their
                  use.},
  comments =	 {This seems to be one of the first and very
                  influential papers dealing with the symbolic
                  sequences entropy estimation via Lempel-Ziv
                  \cite{lempel-ziv-76}, \cite{lempel-ziv-77} type of
                  universal compression schemes. The essence of the
                  algorithm is construction of non-repeated prefix
                  trees and analysis of growth of their depth (length
                  of non-repeated prefixes) as a function of the
                  amount of data observed. It is conjectured that the
                  average depth of the trees converges to logN/entropy
                  for any ergodic process. This, however, may not be
                  true (see \cite{shields-97}), and for processes with
                  long correlations the convergence is
                  definitely very slow. Grassberger himself shows that
                  convergence to the entropy rate is only logarithmic
                  in the number of observations, just like it is for
                  the LZ methods, and the bias may be corrected only
                  for simple, short-correlated processes. One should
                  keep in mind that here N is the number of data
                  points (total length of the sequence), and not the
                  size of the ``words'' being analyzed. Having N
                  samples, one can analyze only words of the length
                  ~logN (there are ~N of those) with reasonable
                  sampling, so the convergence is of order of
                  1/(longest word)^power, just like in the direct word
                  sampling methods. Grassberger's method has the
                  following advantages over LZ: approach to asymptote
                  is smoother, and there's less dependence on
                  violations of non-stationarity. To get rid of bias,
                  similarly to \cite{strong_etal-97}, Grassberger
           