
@article{Peretz:2005,
	Author = {Peretz, I. and Zatorre, R. J.},
	Doi = {10.1146/annurev.psych.56.091103.070225},
	Journal = {Annual Review of Psychology},
	Number = {1},
	Pages = {89--114},
	Publisher = {Annual Reviews},
	Title = {Brain Organization for Music Processing},
	Volume = {56},
	Year = {2005},
	Abstract = {Research on how the brain processes music is emerging as a rich and stimulating area of investigation of perception, memory, emotion, and performance. Results emanating from both lesion studies and neuroimaging techniques are reviewed and integrated for each of these musical functions. We focus our attention on the common core of musical abilities shared by musicians and nonmusicians alike. Hence, the effect of musical training on brain plasticity is examined in a separate section, after a review of the available data regarding music playing and reading skills that are typically cultivated by musicians. Finally, we address a currently debated issue regarding the putative existence of music-specific neural networks. Unfortunately, due to scarcity of research on the macrostructure of music organization and on cultural differences, the musical material under focus is at the level of the musical phrase, as typically used in Western popular music.},
}

@article{Zatorre:2007,
	author = {Zatorre, R. J. and Chen, J. L. and Penhune, V. B.},
	title = {When the brain plays music: auditory--motor interactions in music perception and production},
	journal = {Nature Reviews Neuroscience},
	volume = {8},
	pages = {547--558},
	year = {2007},
	doi = {10.1038/nrn2152},
	abstract = {Music performance is both a natural human activity, present in all societies, and one of the most complex and demanding cognitive challenges that the human mind can undertake. Unlike most other sensory--motor activities, music performance requires precise timing of several hierarchically organized actions, as well as precise control over pitch interval production, implemented through diverse effectors according to the instrument involved. We review the cognitive neuroscience literature of both motor and auditory domains, highlighting the value of studying interactions between these systems in a musical context, and propose some ideas concerning the role of the premotor cortex in integration of higher order features of music with appropriately timed and organized actions.
},
}

@article{Ozdemir:2006,
	Author = {{\"O}zdemir, E. and Norton, A. and Schlaug, G.},
	Doi = {10.1016/j.neuroimage.2006.07.013},
	Journal = {NeuroImage},
	Number = {2},
	Pages = {628--635},
	Publisher = {Elsevier},
	Title = {Shared and distinct neural correlates of singing and speaking},
	Volume = {33},
	Year = {2006},
	Abstract = {Using a modified sparse temporal sampling fMRI technique, we examined both shared and distinct neural correlates of singing and speaking. In the experimental conditions, 10 right-handed subjects were asked to repeat intoned (``sung'') and non-intoned (``spoken'') bisyllabic words/phrases that were contrasted with conditions controlling for pitch (``humming'') and the basic motor processes associated with vocalization (``vowel production''). Areas of activation common to all tasks included the inferior pre- and post-central gyrus, superior temporal gyrus (STG), and superior temporal sulcus (STS) bilaterally, indicating a large shared network for motor preparation and execution as well as sensory feedback/control for vocal production. The speaking more than vowel-production contrast revealed activation in the inferior frontal gyrus most likely related to motor planning and preparation, in the primary sensorimotor cortex related to motor execution, and the middle and posterior STG/STS related to sensory feedback. The singing more than speaking contrast revealed additional activation in the mid-portions of the STG (more strongly on the right than left) and the most inferior and middle portions of the primary sensorimotor cortex. Our results suggest a bihemispheric network for vocal production regardless of whether the words/phrases were intoned or spoken. Furthermore, singing more than humming (``intoned speaking'') showed additional right-lateralized activation of the superior temporal gyrus, inferior central operculum, and inferior frontal gyrus which may offer an explanation for the clinical observation that patients with non-fluent aphasia due to left hemisphere lesions are able to sing the text of a song while they are unable to speak the same words.},
}

@article{Salame:1989,
	author = {Salam{\'e}, P. and Baddeley, A.},
	title = {Effects of background music on phonological short-term memory},
	journal = {The Quarterly Journal of Experimental Psychology Section A},
	volume = {41},
	number = {1},
	pages = {107--122},
	year = {1989},
	publisher = {Psychology Press},
	doi = {10.1080/14640748908402355},
	abstract = {Immediate memory for visually presented verbal material is disrupted by concurrent speech, even when the speech is unattended and in a foreign language. Unattended noise does not produce a reliable decrement. These results have been interpreted in terms of a phonological short-term store that excludes non-speechlike sounds. The characteristics of this exclusion process were explored by studying the effects of music on the serial recall of sequences of nine digits presented visually. Experiment 1 compared the effects of unattended vocal or instrumental music with quiet and showed that both types of music disrupted STM performance, with vocal music being more disruptive than instrumental music. Experiment 2 attempted to replicate this result using more highly trained subjects. Vocal music caused significantly more disruption than instrumental music, which was not significantly worse than the silent control condition. Experiment 3 compared instrumental music with unattended speech and with noise modulated in amplitude, the degree of modulation being the same as in speech. The results showed that the noise condition did not differ from silence; both of these proved less disruptive than instrumental music, which was in turn less disruptive than the unattended speech condition. Theoretical interpretation of these results and their potential practical implications for the disruption of cognitive performance by background music are discussed.
},
}

@article{Mandryk:2007,
	author = {Mandryk, R. L. and Atkins, M. S.},
	title = {A fuzzy physiological approach for continuously modeling emotion during interaction with play technologies},
	journal = {International Journal of Human-Computer Studies},
	volume = {65},
	number = {4},
	pages = {329--347},
	year = {2007},
	publisher = {Elsevier},
	doi = {10.1016/j.ijhcs.2006.11.011},
	abstract = {The popularity of computer games has exploded in recent years, yet methods of evaluating user emotional state during play experiences lag far behind. There are few methods of assessing emotional state, and even fewer methods of quantifying emotion during play. This paper presents a novel method for continuously modeling emotion using physiological data. A fuzzy logic model transformed four physiological signals into arousal and valence. A second fuzzy logic model transformed arousal and valence into five emotional states relevant to computer game play: boredom, challenge, excitement, frustration, and fun. Modeled emotions compared favorably with a manual approach, and the means were also evaluated with subjective self-reports, exhibiting the same trends as reported emotions for fun, boredom, and excitement. This approach provides a method for quantifying emotional states continuously during a play experience.},
}

@article{Barendregt:2006,
	Author = {Barendregt, W. and Bekker, M. M. and Bouwhuis, D. G. and Baauw, E.},
	Doi = {10.1016/j.ijhcs.2006.03.004},
	Journal = {International Journal of Human-Computer Studies},
	Number = {9},
	Pages = {830--846},
	Publisher = {Elsevier},
	Title = {Identifying usability and fun problems in a computer game during first use and after some practice},
	Volume = {64},
	Year = {2006},
	Abstract = {This paper describes an experiment to discover the change in the types of detected problems and the attitude of children towards a game when user testing a computer game for young children during first use and after they have practiced with a game. Both the numbers of different types of identified problems and the severity of the problems are investigated. Based on this knowledge, practitioners could adapt the set up of their user tests to effectively find as many aspects of the game as possible that merit change, according to the aims of the developers. The study shows that usability problems caused by a lack of knowledge were more often identified during first use. Furthermore, fun problems related to a too-high challenge level may disappear after some practice, whereas fun problems caused by the game taking over control for too long while the user wants to proceed playing the game were identified more often after some practice. The study shows that the impact severity of problems detected during first use was higher than when children had more practice with a game. As a result of these changes in experienced problems the commonly used measures efficiency, effectiveness and satisfaction increased when children had practiced with the game. Finally, the study also shows that the set of most severe problems identified during first use may be radically different from the set of most severe problems identified after some practice.},
}

@inproceedings{Buxton:1980,
	author = {Buxton, W. and Sniderman, R.},
	title = {Iteration in the design of the human-computer interface},
	booktitle = {Proceedings of the 13th Annual Meeting, Human Factors Association of Canada},
	pages = {72--81},
	year = {1980},
	abstract = {Issues pertaining to designing effective human-computer interfaces are discussed.  This 
presentation focuses on the special case of providing congenial computer-based tools to 
end users who are expert in their own area, but who may be technologically naive.  In 
so doing, we draw examples from a particular study of designing computer systems for 
professional musicians.  This experience brings to light many issues which have 
relevance beyond the specific application of music.  These include the importance of 
effective prototyping tools, the use of test subjects during the design process, the 
importance of developing methods of performance evaluation, and more generally, the 
value of taking an iterative approach to design. 
},
}

@inproceedings{Marshall:2006,
	Author = {Marshall, M. T. and Wanderley, M. M.},
	Booktitle = {Computer Music Modeling And Retrieval: Third International Symposium, CMMR 2005, Pisa, Italy, September 26--28, 2005, Revised Papers},
	Doi = {10.1007/11751069},
	Publisher = {Springer},
	Title = {Evaluation of Sensors as Input Devices for Computer Music Interfaces},
	Year = {2006},
	Abstract = {This paper presents ongoing research into the design and creation of interfaces for computer music. This work concentrates on the use of sensor as the primary means of interaction for computer music, and examines the relationships between types of sensors and musical functions. Experiments are described which aim to discover the particular suitability of certain sensors for specific musical tasks. The effects of additional visual feedback on the perceived suitability of these sensors is also examined. Results are given, along with a discussion of their possible implications for computer music interface design and pointers for further work on this topic.},
}

@article{Essl:2006,
	Author = {Essl, G. and O'Modhrain, S.},
	Doi = {10.1017/S135577180600152X},
	Journal = {Organised Sound},
	Number = {3},
	Pages = {285--296},
	Publisher = {Cambridge Univ Press},
	Title = {An enactive approach to the design of new tangible musical instruments},
	Volume = {11},
	Year = {2006},
	Abstract = {In this paper, we propose a theoretical framework for the design of tangible interfaces for musical expression. The main insight for the proposed approach is the importance and utility of familiar sensorimotor experiences for the creation of engaging and playable new musical instruments. In particular, we suggest exploiting the commonalities between different natural interactions by varying the auditory response or tactile details of the instrument within certain limits. Using this principle, devices for classes of sounds such as coarse grain collision interactions or friction interactions can be designed. The designs we propose retain the familiar tactile aspect of the interaction so that the performer can take advantage of tacit knowledge gained through experiences with such phenomena in the real world.},
}

@article{Jorda:2004,
	Author = {Jord{\`a}, S.},
	Doi = {10.1080/0929821042000317886},
	Journal = {Journal of New Music Research},
	Number = {3},
	Pages = {321--341},
	Publisher = {Routledge, part of the Taylor \& Francis Group},
	Title = {Instruments and Players: Some Thoughts on Digital Lutherie},
	Volume = {33},
	Year = {2004},
	Abstract = {Musical instruments are used to play and to produce music, transforming the actions of one or more performers into sound. This article explores some instrument design issues, structured into three distinct parts. The first section attempts to define what musical instruments are, how traditional instruments function and what they can do, and what future instruments could be, trying to figure out how we could better exploit their unlimited potential. The second section gives a quick review of the current know-how and the technical and conceptual frameworks in which new instrument designers and researchers are currently working. It is not an actual survey of new instruments and controllers, but more a survey of thoughts and knowledge about them. The third and last section studies the dynamic relationship that builds between the player and the instrument, introducing such concepts as efficiency, apprenticeship, and the learning curve. It explores generic properties of some musical instruments such as the diversity, variability or reproducibility of their musical output, the linearity or non-linearity of their behaviour, and tries to figure out how these aspects can bias the relationship between the instrument and the player, and how they may relate to more commonly studied concepts such as expressivity or virtuosity. The aim of this paper is the foundation of a theoretical framework in which the possibilities and the diversity of musical instruments, as well as the possibilities and expressive freedom of human performers, could all be evaluated.},
}

@article{Fels:2004,
	author = {Fels, S.},
	title = {Designing for Intimacy: Creating New Interfaces for Musical Expression},
	journal = {Proceedings of the IEEE},
	volume = {92},
	number = {4},
	pages = {672--685},
	year = {2004},
	doi = {10.1109/JPROC.2004.825887},
	abstract = {Contemporary musical instrument design using computers provides nearly limitless potential for designing the mapping between gesture and sound. When designing effective and expressive musical instruments, the types of relationship between musician/player and his instrument and the aesthetics of the relationships must be considered. This paper discusses four types of relationships and their aesthetics. A high degree of intimacy is achieved when the relationship reaches a level where the mapping between control and sound is transparent to the player, that is, the player embodies the device. Ultimately, this type of relationship allows intent and expression to flow through the player to the sound and, hence, create music. Three new interfaces for musical expression, the Iamascope, Sound Sculpting and Tooka, provide examples of how instruments may be designed to develop and explore intimacy and embodiment of new musical instruments.
},
}

@article{Burtner:2003,
	Author = {Burtner, M.},
	Journal = {Organised Sound},
	Number = {2},
	Pages = {201--213},
	Publisher = {Cambridge Univ Press},
	Title = {The {Metasaxophone}: concept, implementation, and mapping strategies for a new computer music instrument},
	Volume = {7},
	Year = {2003},
	Abstract = {The Metasaxophone is an acoustic tenor saxophone retrofitted with an onboard computer microprocessor and an array of sensors that convert performance data into MIDI control messages. The instrument has additionally been outfitted with a unique microphone system that allows for detailed control of the amplified sound. While maintaining full acoustic functionality it is also a versatile MIDI controller and an electric instrument. A primary motivation behind the Metasaxophone is to put signal processing under direct expressive control of the performer. Through the combination of gestural and audio performance control, employing both discrete and continuous multilayered mapping strategies, the Metasaxophone can be adapted for a wide range of musical purposes. This paper explores the artistic and technical development of the instrument, as well as new conceptions of musical mappings arising from the enhanced interface.},
}

@article{Poli:2004,
	Author = {De Poli, G.},
	Doi = {10.1080/0929821042000317796},
	Journal = {Journal of New Music Research},
	Number = {3},
	Pages = {189--202},
	Publisher = {Routledge},
	Title = {Methodologies for Expressiveness Modelling of and for Music Performance},
	Url = {https://www.enactivenetwork.org/download.php?id=34},
	Volume = {33},
	Year = {2004},
	Abstract = {Expression is an important aspect of music performance. It is the added value of a performance, and is part of the reason that music is interesting to listen to and sounds alive. Moreover, understanding and modelling expressive content communication is important in many engineering applications. In human musical performance, acoustical or perceptual changes in sound are organized in a complex way by the performer in order to communicate musical content to the listener. The same piece of music can be performed trying to convey a specific interpretation of the score by adding mutable expressive intentions. The analysis of these systematic deviations has led to the formulation of several models that try to describe their structures, with the aim of explaining where, how and why a performer modifies, sometime in an unconscious way, what is indicated by the notation of the score. Modelling paradigms and problems are reviewed and issues for future research efforts are discussed.},
}

@article{Levitin:2003,
	Author = {Levitin, D. J. and McAdams, S. and Adams, R. L.},
	Doi = {10.1017/S135577180200208X},
	Journal = {Organised Sound},
	Number = {2},
	Pages = {171--189},
	Publisher = {Cambridge Univ Press},
	Title = {Control parameters for musical instruments: a foundation for new mappings of gesture to sound},
	Url = {http://www.psych.mcgill.ca/labs/levitin/research/ControlParams.pdf},
	Volume = {7},
	Year = {2003},
	Abstract = {In this paper we describe a new way of thinking about musical tones, specifically in the context of how features of a sound might be controlled by computer musicians, and how those features might be most appropriately mapped onto musical controllers. Our approach is the consequence of one bias that we should reveal at the outset: we believe that electronically controlled (and this includes computer-controlled) musical instruments need to be emancipated from the keyboard metaphor; although piano-like keyboards are convenient and familiar, they limit the musician's expressiveness (Mathews 1991, Vertegaal and Eaglestone 1996, Paradiso 1997, Levitin and Adams 1998). This is especially true in the domain of computer music, in which timbres can be created that go far beyond the physical constraints of traditional acoustic instruments.},
}

@inproceedings{Gurevich:2007,
	Address = {New York, NY, USA},
	Author = {Gurevich, M. and Trevi{\~n}o, J.},
	Booktitle = {Proceedings of the 7th International Conference on New Interfaces for Musical Expression},
	Pages = {106--111},
	Publisher = {ACM Press},
	Title = {Expression and its discontents: toward an ecology of musical creation},
	Url = {http://nime.org/2007/proc/nime2007_106.pdf},
	Year = {2007},
	Abstract = {We describe the prevailing model of musical expression, which assumes a binary formulation of ``the text'' and ``the act,'' along with its implied roles of composer and performer. We argue that this model not only excludes some contemporary aesthetic values but also limits the communicative ability of new music interfaces. As an alternative, an ecology of musical creation accounts for both a diversity of aesthetic goals and the complex interrelation of human and non-human agents. An ecological perspective on several approaches to musical creation with interactive technologies reveals an expanded, more inclusive view of artistic interaction that facilitates novel, compelling ways to use technology for music. This paper is fundamentally a call to consider the role of aesthetic values in the analysis of artistic processes and technologies.},
}

@phdthesis{Janer:2008,
	author = {Janer, J.},
	title = {Singing-driven interfaces for sound synthesizers},
	school = {Music Technology Group, Universitat Pompeu Fabra, Barcelona},
	year = {2008},
	url = {http://www.mtg.upf.edu/~jjaner/phd/Tesi_jjaner_online.pdf},
	abstract = {Together with the sound synthesis engine, the user interface, or controller, is a basic component of any digital music synthesizer and the primary focus of this dissertation. Under the title of singing-driven interfaces, we study the design of systems, that based on the singing voice as input, can control the synthesis of musical sounds.
From a number of preliminary experiments and studies, we identify the principal issues involved in voice-driven synthesis. We propose one approach for controlling a singing voice synthesizer and another one for controlling the synthesis of other musical instruments. In the former, input and output signals are of the same nature, and control to signal mappings can be direct. In the latter, mappings become more complex, depending on the phonetics of the input voice and the characteristics of the synthesized instrument sound. For this latter case, we present a study on vocal imitation of instruments showing that these voice signals consist of syllables with musical meaning. Also, we suggest linking the characteristics of voice signals to instrumental gestures, describing these signals as vocal gestures.
Within the wide scope of the voice-driven synthesis topic, this dissertation studies the relationship between the human voice and the sound of musical instruments by addressing the automatic description of the voice and the mapping strategies for a meaningful control of the synthesized sounds. The contributions of the thesis include several voice analysis methods for using the voice as a control input: a) a phonetic alignment algorithm based on dynamic programming; b) a segmentation algorithm to isolate vocal gestures; c) a formant tracking algorithm; and d) a breathiness characterization algorithm. We also propose a general framework for defining the mappings from vocal gestures to the synthesizer parameters, which are configured according to the instrumental sound being synthesized.
As a way to demonstrate the results obtained, two real-time prototypes are implemented. The first prototype controls the synthesis of a singing voice and the second one is a generic controller for other instrumental sounds.},
}

@inproceedings{Keifer:2008,
	Author = {Kiefer, C. and Collins, N. and Fitzpatrick, G.},
	Booktitle = {Proceedings of the 8th International Conference on New Interfaces for Musical Expression ({NIME}'08)},
	Keywords = {evaluation, HCI},
	Title = {{HCI} Methodology For Evaluating Musical Controllers: A Case Study},
	Url = {http://www.informatics.sussex.ac.uk/users/ck84/text/HCIMethodologyNIME08.pdf},
	Year = {2008},
	Abstract = {There is small but useful body of research concerning the evaluation of musical interfaces with HCI techniques. In this paper, we present a case study in implementing these techniques; we describe a usability experiment which evaluated the Nintendo Wiimote as a musical controller, and reflect on the effectiveness of our choice of HCI methodologies in this context. The study offered some valuable results, but our picture of the Wiimote was incomplete as we lacked data concerning the participants' instantaneous musical experience. Recent trends in HCI are leading researchers to tackle this problem of evaluating user experience; we review some of their work and suggest that with some adaptation it could provide useful new tools and methodologies for computer musicians.},
}

@article{Rodgers:1988,
	Author = {Rodgers, J. L. and Nicewander, W. A.},
	Journal = {The American Statistician},
	Keywords = {pearson, correlation},
	Month = feb,
	Number = {1},
	Pages = {59--66},
	Publisher = {JSTOR},
	Title = {Thirteen ways to look at the correlation coefficient},
	Url = {http://www.jstor.org/stable/2685263},
	Volume = {42},
	Year = {1988},
	Abstract = {In 1885, Sir Francis Galton first defined the term ``regression'' and completed the theory of bivariate correlation. A decade later, Karl Pearson developed the index that we still use to measure correlation, Pearson's r. Our article is written in recognition of the 100th anniversary of Galton's first discussion of regression and correlation. We begin with a brief history. Then we present 13 different formulas, each of which represents a different computational and conceptual definition of r. Each formula suggests a different way of thinking about this index, from algebraic, geometric, and trigonometric settings. We show that Pearson's r (or simple functions of r) may variously be thought of as a special type of mean, a special type of variance, the ratio of two means, the ratio of two variances, the slope of a line, the cosine of an angle, and the tangent to an ellipse, and may be looked at from several other interesting perspectives.},
}

@article{Xu:2008,
	author = {Xu, W. and Chang, C. and Hung, Y. S. and Fung, P. C. W.},
	title = {Asymptotic properties of order statistics correlation coefficient in the normal cases},
	journal = {{IEEE} Transactions on Signal Processing},
	year = {2008},
	doi = {10.1109/TSP.2007.916127},
	keywords = {OSCC},
}

@article{Lindemann:2007,
	Author = {Lindemann, E.},
	Doi = {10.1109/MSP.2007.323267},
	Issn = {1053-5888},
	Journal = {{IEEE} Signal Processing Magazine},
	Keywords = {frequency synthesizers, harmonic analysis, musical acoustics, musical instruments, additive synthesis, concatenative synthesis, idiomatic instrumental phrases, music synthesis, reconstructive phrase modeling, sampling sound quality, time-varying harmonics plus noise elements},
	Month = mar,
	Number = {2},
	Pages = {80--91},
	Title = {Music Synthesis with Reconstructive Phrase Modeling},
	Volume = {24},
	Year = {2007},
	Abstract = {This article describes a new synthesis technology called reconstructive phrase modeling (RPM). A goal of RPM is to combine the realistic sound quality of sampling with the performance interaction of functional synthesis. Great importance is placed on capturing the dynamics of note transitions---slurs, legato, bow changes, etc. Expressive results are achieved with conventional keyboard controllers. Mastery of special performance techniques is not needed. RPM is an analysis-synthesis system that is related to two important trends in computer music research. The first is a form of additive synthesis in which sounds are represented as a sum of time-varying harmonics plus noise elements. RPM creates expressive performances by searching a database of idiomatic instrumental phrases and combining modified fragments of these phrases to form a new expressive performance. This approach is related to another research trend called concatenative synthesis.},
}

@article{Goydke:2004,
	Author = {Goydke, Katja N. and Altenm{\"u}ller, Eckart and M{\"o}ller, J{\"u}rn and M{\"u}nte, Thomas F.},
	Doi = {10.1016/j.cogbrainres.2004.06.009},
	Journal = {Cognitive Brain Research},
	Keywords = {Sensory systems, Auditory systems: control physiology; Music; Emotion; Brain; Tones; Deviant; Mismatch negativity; Pitch; Timbre},
	Month = jul,
	Number = {3},
	Pages = {351--359},
	Title = {Changes in emotional tone and instrumental timbre are reflected by the mismatch negativity},
	Url = {http://www.sciencedirect.com/science/article/B6SYV-4CXMYY3-2/1/81b4a18a73f2ffd93c575d37936eb3fb},
	Volume = {21},
	Year = {2004},
	Abstract = {The present study examined whether or not the brain is capable to preattentively discriminate tones differing in emotional expression or instrumental timbre. In two event-related potential (ERP) experiments single tones (600 ms) were presented which had been rated as happy or sad in a pretest. In experiment 1, 12 non-musicians passively listened to tone series comprising a frequent (standard) single musical tone played by a violin in a certain pitch and with a certain emotional connotation (happy or sad). Among these standard tones deviant tones differing in emotional valence, either in instrumental timbre or in pitch were presented. All deviants generated mismatch negativity (MMN) responses. The MMN scalp topography was similar for all of the three deviants but latency was shorter for pitch deviants than for the other two conditions. The topography of the mismatch responses was indistinguishable. In a second experiment, subjects actively detected the deviant tones by button press. All detected deviants generated P3b waves at parietal leads. These results indicate that the brain is not only able to use simple physical differences such as pitch for rapid preattentive categorization but can also perform similar operations on the basis of more complex differences between tones of the same pitch such as instrumental timbre and the subtle timbral differences associated with different emotional expression. This rapid categorization may serve as a basis for the further fine-grained analysis of musical (and other) sounds with regard to their emotional content.},
}

@inproceedings{Nordahl:2008,
	Address = {Florence},
	Author = {Nordahl, R. and Serafin, S. and Timcenko, O.},
	Booktitle = {COST-SID workshop on Sonic Interaction Design},
	Month = apr,
	Title = {Contextualisation and evaluation of novel sonic interfaces using problem based learning},
	Url = {http://trac.sme-ccppd.org/SID/browser/action/workshops/2008/Sessions/Contextualisation%20and%20evaluation%20of%20novel%20sonic%20interfaces%20using%20problem%20based%20learning.pdf?format=raw},
	Year = {2008},
	Abstract = {In this paper, we advocate the use of problem based learning (PBL) as a pedagogical method in educations addressing the design and evaluation of sonic interfaces. We introduce two projects as examples which adopt PBL in the design of sonic interactive systems, and we discuss our approach in combining contextual and technical issues in interaction design.

The challenge becomes therefore not how to design interactions, but what to design and especially why to design interactive systems. More specifically, it is the why which leads to the how and the what.},
}

@inproceedings{Campoy:2005,
	Author = {Campoy, P. and Vicente, C. J.},
	Booktitle = {Artificial Neural Networks: Biological Inspirations -- {ICANN} 2005},
	Doi = {10.1007/11550822_59},
	Pages = {379--384},
	Publisher = {Springer},
	Series = {Lecture Notes in Computer Science},
	Title = {Residual Activity in the Neurons Allows {SOMs} to Learn Temporal Order},
	Url = {http://www.springerlink.com/content/c0lkevaflbvnxyut},
	Volume = {3696},
	Year = {2005},
	Abstract = {A novel activity associated to the neurons of a SOM, called Residual Activity (RA), is defined in order to enlarge into the temporal domain the capabilities of a Self-Organizing Map for clustering and classifying the input data when it offers a temporal relationship. This novel activity is based on the biological plausible idea of partially retaining the activity of the neurons for future stages, that increases their probability to become the winning neuron for future stimuli. The proposed paper also proposes two quantifiable parameters for evaluating the performances of algorithms that aim to exploit temporal relationship of the input data for classification. Special designed benchmarks with spatio-temporal relationship are presented in which the proposed new algorithm, called TESOM (acronym for Time Enhanced SOM), has demonstrated to improve the temporal index without decreasing the quantization error.},
}

@inproceedings{Vesanto:1999,
  author    = {Vesanto, J. and Ahola, J.},
  title     = {Hunting for Correlations in Data Using the Self-Organizing Map},
  booktitle = {Proceedings of the International ICSC Congress on Computational Intelligence Methods and Applications (CIMA '99)},
  editor    = {Bothe, H. and Oja, E. and Massad, E. and Haefke, C.},
  publisher = {ICSC Academic Press},
  pages     = {279--285},
  year      = {1999},
  url       = {http://www.cis.hut.fi/projects/ide/publications/papers/aida99b.zip},
  abstract  = {The Self-Organizing Map (SOM) is an efficient tool for visualization of multidimensional numerical data. One of the tasks it is used for is correlation hunting. In this paper we present a simple method to enhance correlation hunting in the case of a large number of variables. Different variations of the method - component plane reorganization - are evaluated on a complex test data. The purpose is to somewhat validate the use of SOM in correlation hunting and to evaluate the strengths and weaknesses of different reorganization procedures. A case with a real world data is also presented to show the usefulness of the method.},
}

@inproceedings{Kaski:1996,
	Author = {Kaski, S. and Lagus, K.},
	Booktitle = {Proceedings of ICANN 96},
	Doi = {10.1007/3-540-61510-5_136},
	Pages = {809--814},
	Publisher = {Springer},
	Title = {Comparing self-organizing maps},
	Url = {http://www.cis.hut.fi/~sami/papers/critfin.ps},
	Year = {1996},
	Abstract = {In exploratory analysis of high-dimensional data the self-organizing map can be used to illustrate relations between the data items. We have developed two measures for comparing how different maps represent these relations. The other combines an index of discontinuities in the mapping from the input data set to the map grid with an index of the accuracy with which the map represents the data set. This measure can be used for determining the goodness of single maps. The other measure has been used to directly compare how similarly two maps represent relations between data items. Such a measure of the dissimilarity of maps is useful, e.g., for analyzing the sensitivity of maps to variations in their inputs or in the learning process. Also the similarity of two data sets can be compared indirectly by comparing the maps that represent them.
},
}

@article{Hashimoto:2007,
	Author = {Hashimoto, S.},
	Doi = {10.1080/09298210701859289},
	Journal = {Journal of New Music Research},
	Keywords = {evaluation, creativity, interfaces, self-organizing map, wii},
	Number = {3},
	Pages = {197--205},
	Publisher = {Routledge},
	Title = {Evaluation Issue of {KANSEI} Technology and Sound and Music Computing Projects at {Waseda} {University}},
	Volume = {36},
	Year = {2007},
	Abstract = {Information technology can be classified into three categories; physical signal processing, semantic symbol processing and KANSEI (emotional) information processing. The technology for art belongs to the last one. The technology to access human emotion cannot be evaluated in the same way as the one for industrial applications. We need a sort of multilateral standard method to evaluate technical research on music and art that requires both universal and individual viewpoints. This paper begins with the evaluation issue of KANSEI technology related to music. Then our recent projects on music computing promoted by the Japan Science and Technology Agency are introduced.},
}

@inproceedings{Mulier:1994,
  author    = {Mulier, F. and Cherkassky, V.},
  title     = {Learning rate schedules for self-organizing maps},
  booktitle = {Proceedings of the 12th International Conference on Pattern Recognition},
  volume    = {2},
  year      = {1994},
  doi       = {10.1109/ICPR.1994.576908},
  abstract  = {Kohonen maps have been successfully applied for data reduction and density approximation. Unfortunately, the choice of the neighborhood function and the learning rate in the Kohonen model remains empirical. We present a new statistically motivated approach to determine the contribution of each data presentation during training on the final position of the units of the trained map. Experimental results show that employing the commonly used learning rates leads to unit locations which are overly influenced by the later presentations (i.e., last 20% of data points in the finite training set). Better learning rate schedules and neighborhood functions are then determined which allow more uniform contributions of the training data on the unit locations. These improved rates are shown to be a suitable generalization of the standard rates given by stochastic approximation theory for a self-organizing map of units},
}

@book{Chatfield:1983,
	Author = {Chatfield, C.},
	Edition = {Second},
	Publisher = {Chapman {\&} Hall},
	Title = {Statistics for Technology: A Course in Applied Statistics},
	Year = {1978},
}

@book{Lischner:2003,
  author    = {Lischner, R.},
  title     = {{C++} in a Nutshell},
  editor    = {Gennick, J.},
  publisher = {O'Reilly},
  year      = {2003},
}

@inproceedings{Fradkin:2003,
	Address = {New York, NY, USA},
	Author = {Fradkin, D. and Madigan, D.},
	Booktitle = {Proceedings of the ninth ACM SIGKDD International Conference on Knowledge Discovery and Data Mining},
	Doi = {10.1145/956750.956812},
	Pages = {517--522},
	Publisher = {ACM Press},
	Title = {Experiments with random projections for machine learning},
	Url = {http://www.stat.rutgers.edu/~madigan/PAPERS/rp.pdf},
	Year = {2003},
	Abstract = {Dimensionality reduction via Random Projections has attracted considerable attention in recent years. The approach has interesting theoretical underpinnings and offers computational advantages. In this paper we report a number of experiments to evaluate Random Projections in the context of inductive supervised learning. In particular, we compare Random Projections and PCA on a number of different datasets and using different machine learning methods. While we find that the random projection approach predictively underperforms PCA, its computational advantages may make it attractive for certain applications.
},
}

@article{Iverson:1993,
	Author = {Iverson, P. and Krumhansl, C. L.},
	Doi = {10.1121/1.407371},
	Journal = {The Journal of the Acoustical Society of America},
	Pages = {2595},
	Publisher = {ASA},
	Title = {Isolating the dynamic attributes of musical timbre},
	Volume = {94},
	Year = {1993},
	Abstract = {Three experiments examined the dynamic attributes of timbre by evaluating the role of onsets in similarity judgments. In separate experiments, subjects heard complete orchestral instrument tones, the onsets of those tones, and tones with the onsets removed (``remainders''). Ratings for complete tones corresponded to those for onsets, indicating that the salient acoustic attributes for complete tones are present at the onset. Ratings for complete tones also corresponded to those for remainders, indicating that the salient attributes for complete tones are present also in the absence of onsets. Subsequent acoustic analyses demonstrated that this pattern of similarity was due to the centroid frequencies and amplitude envelopes of the tones. The results indicate that the dynamic attributes of timbre are not only present at the onset, but also throughout, and that multiple acoustic attributes may contribute to the same perceptual dimensions.},
}

@inproceedings{Artac:2002,
  author    = {Artac, M. and Jogan, M. and Leonardis, A.},
  title     = {Incremental {PCA} for on-line visual learning and recognition},
  booktitle = {Proc. International Conference on Pattern Recognition (ICPR 2002)},
  volume    = {3},
  pages     = {781--784},
  year      = {2002},
  doi       = {10.1109/ICPR.2002.1048133},
  url       = {http://ieeexplore.ieee.org/search/wrapper.jsp?arnumber=1048133},
  keywords  = {online, machine learning},
  abstract  = {The methods for visual learning that compute a space of eigenvectors by Principal Component Analysis (PCA) traditionally require a batch computation step. Since this leads to potential problems when dealing with large sets of images, several incremental methods for the computation of the eigenvectors have been introduced. However, such learning cannot be considered as an on-line process, since all the images are retained until the final step of computation of space of eigenvectors, when their coefficients in this subspace are computed. In this paper we propose a method that allows for simultaneous learning and recognition. We show that we can keep only the coefficients of the learned images and discard the actual images and still are able to build a model of appearance that is fast to compute and open-ended. We performed extensive experimental testing which showed that the recognition rate and reconstruction accuracy are comparable to those obtained by the batch method.
},
}

@incollection{Krumhansl:1989,
	Address = {Amsterdam},
	Author = {Krumhansl, C. L.},
	Booktitle = {Structure and perception of electroacoustic sound and music},
	Editor = {Nielzen, S. and Olsson, O.},
	Number = {846},
	Pages = {43--53},
	Publisher = {Elsevier},
	Series = {Excerpta Medica},
	Title = {Why is musical timbre so hard to understand?},
	Year = {1989},
}

@article{Caclin:2006,
	Author = {Caclin, A. and Brattico, E. and Tervaniemi, M. and N{\"a}{\"a}t{\"a}nen, R. and Morlet, D. and Giard, M.-H. and McAdams, S.},
	Doi = {10.1162/jocn.2006.18.12.1959},
	Issn = {0898-929X},
	Journal = {Journal of Cognitive Neuroscience},
	Keywords = {MEMORY, COGNITIVE ability, NEURAL stimulation, PERCEPTION, SOUND, NEUROSCIENCES},
	Month = dec,
	Number = {12},
	Pages = {1959--1972},
	Title = {Separate Neural Processing of Timbre Dimensions in Auditory Sensory Memory},
	Url = {http://0-search.ebscohost.com.catalogue.ulrls.lon.ac.uk/login.aspx?direct=true&db=aph&AN=23263677&site=ehost-live},
	Volume = {18},
	Year = {2006},
	Abstract = {Timbre is a multidimensional perceptual attribute of complex tones that characterizes the identity of a sound source. Our study explores the representation in auditory sensory memory of three timbre dimensions (acoustically related to attack time, spectral centroid, and spectrum fine structure), using the mismatch negativity (MMN) component of the auditory event-related potential. MMN is elicited by a discriminable change in a sound sequence and reflects the detection of the discrepancy between the current stimulus and traces in auditory sensory memory. The stimuli used in the present study were carefully controlled synthetic tones. MMNs were recorded after changes along each of the three timbre dimensions and their combinations. Additivity of unidimensional MMNs and dipole modeling results suggest partially separate MMN generators for different timbre dimensions, reflecting their mainly separate processing in auditory sensory memory. The results expand to timbre dimensions a property},
}

@incollection{Burgoyne:2008,
	Address = {Berlin},
	Author = {Burgoyne, A. and McAdams, S.},
	Booktitle = {Sense of Sounds},
	Editor = {Kronland-Martinet, R. and Ystad, S. and Jensen, K.},
	Publisher = {Springer},
	Title = {A meta-analysis of timbre perception using nonlinear extensions to {CLASCAL}},
	Year = {forthcoming},
	Abstract = {Seeking to identify the constituent parts of the multidimensional auditory attribute that musicians know as timbre, music psychologists have made extensive use of multidimensional scaling (mds), a statistical technique for visualising the geometric spaces implied by perceived dissimilarity. mds is also well known in the machine learning community, where it is used as a basic technique for dimensionality reduction. We adapt a nonlinear variant of mds that is popular in machine learning, Isomap, for use in analysing psychological data and re-analyse three earlier experiments on human perception of timbre. Isomap is designed to eliminate undesirable nonlinearities in the input data in order to reduce the overall dimensionality; our results show that it succeeds in these goals for timbre spaces, compressing the output onto well-known dimensions of timbre and highlighting the challenges inherent in quantifying differences in spectral shape. 
},
}

@incollection{Stowell:2008c,
	Address = {Cambridge, MA},
	Author = {Stowell, D.},
	Booktitle = {The {SuperCollider} Book},
	Editor = {Wilson, S. and Cottle, D. and Collins, N.},
	Publisher = {MIT Press},
	Title = {Writing Unit Generator Plugins},
	Year = {in press},
}

@inproceedings{Stowell:2008b,
  author    = {Stowell, D. and Plumbley, M. D.},
  title     = {Robustness and independence of voice timbre features under live performance acoustic degradations},
  booktitle = {Proceedings of the 11th Conference on Digital Audio Effects (DAFx-08)},
  year      = {2008},
  keywords  = {voice, vocal, timbre, robust, features, entropy, singing, speech, beatboxing},
}

@article{McAdams:1995,
	Author = {McAdams, S. and Winsberg, S. and Donnadieu, S. and De Soete, G. and Krimphoff, J.},
	Doi = {10.1007/BF00419633},
	Journal = {Psychological Research},
	Keywords = {timbre, perception},
	Number = {3},
	Pages = {177--192},
	Publisher = {Springer},
	Title = {Perceptual scaling of synthesized musical timbres: Common dimensions, specificities, and latent subject classes},
	Url = {http://www.springerlink.com/content/t14w012k8249n58x/},
	Volume = {58},
	Year = {1995},
	Abstract = {To study the perceptual structure of musical timbre and the effects of musical training, timbral dissimilarities of synthesized instrument sounds were rated by professional musicians, amateur musicians, and nonmusicians. The data were analyzed with an extended version of the multidimensional scaling algorithm CLASCAL (Winsberg {\&} De Soete, 1993), which estimates the number of latent classes of subjects, the coordinates of each timbre on common Euclidean dimensions, a specificity value of unique attributes for each timbre, and a separate weight for each latent class on each of the common dimensions and the set of specificities. Five latent classes were found for a three-dimensional spatial model with specificities. Common dimensions were quantified psychophysically in terms of log-rise time, spectral centroid, and degree of spectral variation. The results further suggest that musical timbres possess specific attributes not accounted for by these shared perceptual dimensions. Weight patterns indicate that perceptual salience of dimensions and specificities varied across classes. A comparison of class structure with biographical factors associated with degree of musical training and activity was not clearly related to the class structure, though musicians gave more precise and coherent judgments than did nonmusicians or amateurs. The model with latent classes and specificities gave a better fit to the data and made the acoustic correlates of the common dimensions more interpretable.
},
}

@book{Mendenhall:1981,
  author    = {Mendenhall, W. and Wackerly, D. D. and Scheaffer, R. L.},
  title     = {Mathematical statistics with applications},
  edition   = {Fourth},
  publisher = {PWS-Kent},
  year      = {1989},
  keywords  = {statistics},
}

@inproceedings{Kiviluoto:1996,
	Author = {Kiviluoto, K.},
	Booktitle = {Proc. International Conference on Neural Networks},
	Doi = {10.1109/ICNN.1996.548907},
	Pages = {294--299},
	Title = {Topology preservation in {Self-Organizing} {Maps}},
	Volume = {1},
	Year = {1996},
	Abstract = {This paper concentrates on the following issues: (1) discussion on what kind of mapping is produced by the SOM algorithm; (2) introduction of a quantitative measure of continuity for the mapping produced by SOM; (3) introduction of a variant of SOM, called the AdSOM, with locally adapting neighborhood radii.},
}

@inproceedings{Dajer:2005,
	Address = {Washington, DC, USA},
	Author = {Dajer, M. E. and Pereira, J. C. and Maciel, C. D.},
	Booktitle = {Proceedings of the Seventh IEEE International Symposium on Multimedia},
	Doi = {10.1109/ISM.2005.84},
	Pages = {765--771},
	Publisher = {IEEE Computer Society},
	Title = {Nonlinear Dynamical Analysis of Normal Voices},
	Year = {2005},
	Abstract = {Human voice has been the focus of study for different areas of sciences. Researches in the last two decades have established the existence of chaos in human voice production. The purpose of this paper is to use nonlinear dynamics methods in the analysis of normal voices from healthy subjects and correlate them to traditional acoustic parameters as well as perceptual analysis. Twelve human voice signals from healthy subjects, 6 males and 6 females, ranging in age from 19 to 39 years old were used. Sustained vowel sounds /a/, /e/ and /i/, from Brazilian Portuguese were recorded at a sampling rate of 22,050 Hz and analyzed in order to obtain acoustic perturbation measures (jitter, shimmer, coefficient of excess - EX, and Pitch amplitude - PA). The phase space reconstruction method was used to describe the nonlinear dynamic characteristics of voice signal samples. This paper shows that non-linear dynamical methods as phase space reconstruction seems to be a suitable technique for voice signals analysis, due to the chaotic component of the human voice. The results suggest that non-linear dynamic analysis does not replace existing techniques instead, they may improve and complement the recent voice analysis methods available for health professionals, speech therapist and clinician.},
}

@article{Margulis:2006,
  author    = {Margulis, E. and Levine, W.},
  title     = {Timbre priming effects and expectation in melody},
  journal   = {Journal of New Music Research},
  volume    = {35},
  number    = {2},
  pages     = {175--182},
  publisher = {Routledge},
  year      = {2006},
  doi       = {10.1080/09298210600835042},
  abstract  = {In this study, participants identified the timbre of pitches when they occurred in isolation, and again when they occurred appended to short melodies. For pitches congruent with the melody, timbre identification generally improved when the pitches were appended to the melody in comparison to when they occurred in isolation. In addition, the amount of improvement was broadly consistent with theoretical accounts of the degree to which the pitches were expected, given the preceding melody. This finding relates both to proposed interactions in processing between pitch and timbre, and to theoretical work regarding melodic expectations. It suggests that melodic expectations can be revealed implicitly, and is consistent with the idea that they operate at a relatively early stage of perceptual processing. In this study, priming effects were shown in listeners without musical training, demonstrating that expectations can develop in response to passive exposure to music, not only in response to formal training.},
}

@inproceedings{Kangas:1989,
  author    = {Kangas, J. and Kohonen, T. and Laaksonen, J. and Simula, O. and Venta, O.},
  title     = {Variants of self-organizing maps},
  booktitle = {Proc. International Joint Conference on Neural Networks},
  pages     = {517--522},
  year      = {1989},
  doi       = {10.1109/IJCNN.1989.118292},
  abstract  = {Self-organizing maps have a connection with traditional vector quantization. A characteristic which makes them resemble certain biological brain maps, however, is the spatial order of their responses which is formed in the learning process. Two innovations are discussed: dynamic weighting of the input signals at each input of each cell, which improves the ordering when very different input signals are used, and definition of neighborhoods in the learning algorithm by the minimum spanning tree, which provides a far better and faster approximation of prominently structured density functions. It is cautioned that if the maps are used for pattern recognition and decision processes, it is necessary to fine-tune the reference vectors such that they directly define the decision borders.},
}

@inproceedings{Ritter:1988,
	Author = {Ritter, H. and Schulten, K.},
	Booktitle = {Proc. International Conference on Neural Networks},
	Doi = {10.1109/ICNN.1988.23838},
	Pages = {109--116},
	Title = {{Kohonen's} self-organizing maps: exploring their computational capabilities},
	Year = {1988},
	Abstract = {The authors demonstrate that the computational capabilities of Kohonen's algorithm provide an unified approach to such diverse fields as sensory mappings, combinatorial optimization, and learning in motor control. For a discrete probability distribution of the training inputs, the formation of the mapping can be described as a probabilistic descent in a potential. In view of their wide applicability, the principles of the algorithm might also be inherent to the maturation of biological brains and could help to achieve a better understanding of these processes from a more unified point of view},
}

@incollection{Kybic:2007,
	Address = {Berlin / Heidelberg},
	Author = {Kybic, J.},
	Booktitle = {Information Processing in Medical Imaging},
	Doi = {10.1007/978-3-540-73273-0_47},
	Pages = {569--580},
	Publisher = {Springer},
	Series = {Lecture Notes in Computer Science},
	Title = {High-Dimensional Entropy Estimation for Finite Accuracy Data: {R-NN} Entropy Estimator},
	Year = {2007},
	Abstract = {We address the problem of entropy estimation for high-dimensional finite-accuracy data. Our main application is evaluating high-order mutual information image similarity criteria for multimodal image registration. The basis of our method is an estimator based on k-th nearest neighbor (NN) distances, modified so that only distances greater than some constant R are evaluated. This modification requires a correction which is found numerically in a preprocessing step using quadratic programming. We compare experimentally our new method with k-NN and histogram estimators on synthetic data as well as for evaluation of mutual information for image similarity.
},
}

@article{Victor:2002,
	Author = {Victor, J. D.},
	Doi = {10.1103/PhysRevE.66.051903},
	Journal = {Physical Review E},
	Keywords = {entropy, entropy estimators, nearest neighbour},
	Number = {5},
	Pages = {051903},
	Publisher = {APS},
	Title = {Binless strategies for estimation of information from neural data},
	Volume = {66},
	Year = {2002},
	Abstract = {We present an approach to estimate information carried by experimentally observed neural spike trains elicited by known stimuli. This approach makes use of an embedding of the observed spike trains into a set of vector spaces, and entropy estimates based on the nearest-neighbor Euclidean distances within these vector spaces [L. F. Kozachenko and N. N. Leonenko, Probl. Peredachi Inf. 23, 9 (1987)]. Using numerical examples, we show that this approach can be dramatically more efficient than standard bin-based approaches such as the ``direct'' method [S. P. Strong, R. Koberle, R. R. de Ruyter van Steveninck, and W. Bialek, Phys. Rev. Lett. 80, 197 (1998)] for amounts of data typically available from laboratory experiments.},
}

@inproceedings{McKinney:2003,
	Author = {McKinney, M. F. and Breebaart, J.},
	Booktitle = {Proc. Int. Symposium on Music Information Retrieval (ISMIR)},
	Pages = {151--158},
	Title = {Features for audio and music classification},
	Year = {2003},
	Abstract = {Four audio feature sets are evaluated in their ability to classify five general audio classes and seven popular music genres. The feature sets include low-level signal properties, mel-frequency spectral coefficients, and two new sets based on perceptual models of hearing. The temporal behavior of the features is analyzed and parameterized and these parameters are included as additional features. Using a standard Gaussian framework for classification, results show that the temporal behavior of features is important for both music and audio classification. In addition, classification is better, on average, if based on features from models of auditory perception rather than on standard features. 
},
}

@techreport{Stowell:2008a,
  author      = {Stowell, D. and Plumbley, M. D.},
  title       = {Characteristics of the beatboxing vocal style},
  institution = {Dept. of Electronic Engineering, Queen Mary, University of London},
  number      = {C4DM-TR-08-01},
  year        = {2008},
}

@article{Ramalingam:2006,
	Author = {Ramalingam, A. K.},
	Doi = {10.1109/TIFS.2006.885036},
	Journal = {IEEE Transactions on Information Forensics and Security},
	Keywords = {spectral crest},
	Number = {4},
	Pages = {457--463},
	Title = {Gaussian {M}ixture {M}odeling of {S}hort-{T}ime {F}ourier {T}ransform Features for Audio Fingerprinting},
	Volume = {1},
	Year = {2006},
	Abstract = {In audio fingerprinting, an audio clip must be recognized by matching an extracted fingerprint to a database of previously computed fingerprints. The fingerprints should reduce the dimensionality of the input significantly, provide discrimination 
among different audio clips, and, at the same time, be invariant to distorted versions of the same audio clip. In this paper, we design fingerprints addressing the above issues by modeling an audio clip by Gaussian mixture models (GMM). We evaluate the performance of many easy-to-compute short-time Fourier transform features, such as Shannon entropy, R{\'e}nyi entropy, spectral centroid, spectral bandwidth, spectral flatness measure, spectral crest factor, and Mel-frequency cepstral coefficients in modeling audio clips using GMM for fingerprinting. We test the robustness of the fingerprints under a large number of distortions. To make the system robust, we use some of the distorted versions of the audio for training. However, we show that the audio fingerprints modeled using GMM are not only robust to the distortions used in training but also to 
distortions not used in training. Among the features tested, spectral centroid performs best with an identification rate of 99.2% at a false positive rate of 10^-4. All of the features give an identification rate of more than 90% at a false positive rate of 10^-3. 
},
}

@article{Hosseinzadeh:2008,
	Author = {Hosseinzadeh, D. and Krishnan, S.},
	Doi = {10.1155/2008/258184},
	Journal = {EURASIP Journal on Advances in Signal Processing},
	Keywords = {spectral crest},
	Note = {Article ID 258184, 10 pages},
	Title = {On the Use of Complementary Spectral Features for Speaker Recognition},
	Volume = {2008},
	Year = {2008},
	Abstract = {The most popular features for speaker recognition are Mel frequency cepstral coefficients (MFCCs) and linear prediction cepstral coefficients (LPCCs). These features are used extensively because they characterize the vocal tract configuration which is known to be highly speaker-dependent. In this work, several features are introduced that can characterize the vocal system in order to complement the traditional features and produce better speaker recognition models. The spectral centroid (SC), spectral bandwidth (SBW), spectral band energy (SBE), spectral crest factor (SCF), spectral flatness measure (SFM), Shannon entropy (SE), and Renyi entropy (RE) were utilized for this purpose. This work demonstrates that these features are robust in noisy conditions by simulating some common distortions that are found in the speakers' environment and a typical telephone channel. Babble noise, additive white Gaussian noise (AWGN), and a bandpass channel with 1 dB of ripple were used to simulate these noisy conditions. The results show significant improvements in classification performance for all noise conditions when these features were used to complement the MFCC and $\Delta$MFCC features. In particular, the SC and SCF improved performance in almost all noise conditions within the examined SNR range (10--40 dB). For example, in cases where there was only one source of distortion, classification improvements of up to 8% and 10% were achieved under babble noise and AWGN, respectively, using the SCF feature.},
}

@article{Laflen:2008,
	Author = {Laflen, J. B. and Lazarus, C. L. and Amin, M. R.},
	Isi = {ISI:000253324100003},
	Issn = {0003-4894},
	Journal = {Annals of Otology, Rhinology and Laryngology},
	Keywords = {YIN, pitch estimation, voice, Voice quality},
	Month = feb,
	Number = {2},
	Pages = {90--97},
	Title = {Pitch deviation analysis of pathological voice in connected speech},
	Volume = {117},
	Year = {2008},
	Abstract = {Objectives: This study compares normal and pathologic voices using a novel voice analysis algorithm that examines pitch deviation during connected speech. The study evaluates the clinical potential of the algorithm as a mechanism to distinguish between normal and pathologic voices using connected speech.

Methods: Adult vocalizations from normal subjects and patients with known benign free-edge vocal fold lesions were analyzed. Recordings had been previously obtained in quiet under controlled conditions. Two phrases and sustained /a/ were recorded per subject. The subject populations consisted of 10 normal and 31 abnormal subjects. The voice analysis algorithm generated 2-dimensional patterns that represent pitch deviation in time and under variable window widths. Measures were collected from these patterns for window widths between 10 and 250 ms. For comparison, jitter and shimmer measures were collected from sustained /a/ by means of the Computerized Speech Lab (CSL). A t-test and tests of sensitivity and specificity assessed discrimination between normal and abnormal populations.

Results: More than 58% of the measures collected from connected speech outperformed the CSL jitter and shimmer measures in population discrimination. Twenty-five percent of the experimental measures (including /a/) indicated significantly different populations (p <.01%).

Conclusions: The results demonstrate that the algorithm distinguishes between normal and abnormal populations by use of samples of connected speech.

},
}

@article{Bentley:1975,
  author   = {Bentley, J. L.},
  title    = {Multidimensional binary search trees used for associative searching},
  journal  = {Communications of the ACM},
  volume   = {18},
  number   = {9},
  pages    = {509--517},
  year     = {1975},
  doi      = {10.1145/361002.361007},
  keywords = {k-d tree, kd tree},
}

@article{Vasicek:1976,
  author    = {Vasicek, O.},
  title     = {A Test for Normality Based on Sample Entropy},
  journal   = {Journal of the Royal Statistical Society. Series B (Methodological)},
  volume    = {38},
  number    = {1},
  pages     = {54--59},
  publisher = {JSTOR},
  year      = {1976},
  abstract  = {A test of the composite hypothesis of normality is introduced. The test is based on the property of the normal distribution that its entropy exceeds that of any other distribution with a density that has the same variance. The test statistic is based on a class of estimators of entropy constructed here. The test is shown to be a consistent test of the null hypothesis for all alternatives without a singular continuous part. The power of the test is estimated against several alternatives. It is observed that the test compares favourably with other tests for normality.},
}

@inproceedings{Kybic:2006,
  author    = {Kybic, J.},
  title     = {Incremental Updating of Nearest Neighbor-Based High-Dimensional Entropy Estimation},
  booktitle = {Proceedings of the International Conference on Acoustics, Speech, and Signal Processing (ICASSP'06)},
  volume    = {3},
  year      = {2006},
  doi       = {10.1109/ICASSP.2006.1660776},
  keywords  = {entropy estimators},
  abstract  = {We present an algorithm for estimating entropy from high-dimensional data based on Kozachenko-Leonenko nearest neighbor estimator. The problem of finding all nearest neighbors is approximately solved using a best-bin first (BBF) bottom-up k-D tree traversal. Our main application is evaluating higher-order mutual information (MI) image similarity criteria that, unlike standard scalar MI, are directly usable for vector (e.g. color) images and can take into account neighborhood information. As during the optimization the MI criterion is often evaluated for very similar images, it is advantageous to update the k-D tree incrementally. We show that the resulting algorithm is fast and accurate enough to be practical for the image registration application.},
}

@article{Browne:2007,
	author = {Browne, M.},
	title = {A geometric approach to non-parametric density estimation},
	journal = {Pattern Recognition},
	volume = {40},
	number = {1},
	pages = {134--140},
	year = {2007},
	publisher = {Elsevier},
	doi = {10.1016/j.patcog.2006.05.012},
	keywords = {entropy estimators},
	abstract = {A novel non-parametric density estimator is developed based on geometric principles. A penalised centroidal Voronoi tessellation forms the basis of the estimator, which allows the data to self-organise in order to minimise estimate bias and variance. This approach is a marked departure from usual methods based on local averaging, and has the advantage of being naturally adaptive to local sample density (scale-invariance). The estimator does not require the introduction of a plug-in kernel, thus avoiding assumptions of symmetricity and morphology. A numerical experiment is conducted to illustrate the behaviour of the estimator, and its characteristics are discussed.},
}

@techreport{Learned-Miller:2004,
	author = {Learned-Miller, E. G.},
	title = {Hyperspacings and the Estimation of Information Theoretic Quantities},
	institution = {University of Massachusetts},
	number = {04-104},
	year = {2004},
	url = {http://www.cs.umass.edu/~elm/papers/04-104.pdf},
	abstract = {The estimation of probability densities from data is widely used as an intermediate step in the estimation of entropy, Kullback-Leibler (KL) divergence, and mutual information, and for statistical tasks such as hypothesis testing. We propose an alternative to density estimation--partitioning a space into regions whose approximate probability mass is known--that can be used for the same purposes. We call these regions hyperspacings, a generalization of spacings in one dimension. After discussing one-dimensional spacings estimates of entropy and KL-divergence, we show how hyperspacings can be used to estimate these quantities (and mutual information) in higher dimensions. Our approach outperforms certain widely used estimators based on intermediate density estimates. Using similar ideas, we also present a new distribution-free hypothesis test for distributional equivalence that compares favorably with the Kolmogorov-Smirnov test. Using hyperspacings, it is easily extended to multiple dimensions.},
}

@inproceedings{Learned-Miller:2003a,
	author = {Learned-Miller, E. G.},
	title = {A new class of entropy estimators for multi-dimensional densities},
	booktitle = {Proceedings of the International Conference on Acoustics, Speech, and Signal Processing (ICASSP'03)},
	volume = {3},
	pages = {297--300},
	month = apr,
	year = {2003},
	issn = {1520-6149},
	doi = {10.1109/ICASSP.2003.1199463},
	keywords = {entropy, parameter estimation, probability, statistical analysis, asymptotically efficient 1D estimators, continuous probability density, density estimation, entropy estimators, m-spacing estimators, multi-dimensional probability densities, one-dimensional probability densities, order statistics, plug-in estimators, polynomial estimators, sample size},
	abstract = {We present a new class of estimators for approximating the entropy of multi-dimensional probability densities based on a sample of the density. These estimators extend the classic ``m-spacing'' estimators of Vasicek (1976) and others for estimating entropies of one-dimensional probability densities. Unlike plug-in estimators of entropy, which first estimate a probability density and then compute its entropy, our estimators avoid the difficult intermediate step of density estimation. For fixed dimension, the estimators are polynomial in the sample size. Similarities to consistent and asymptotically efficient one-dimensional estimators of entropy suggest that our estimators may share these properties.},
}

@book{Cover:2006,
	author = {Cover, T. M. and Thomas, J. A.},
	title = {Elements of Information Theory},
	year = {2006},
	publisher = {Wiley-Interscience},
	address = {New York},
	url = {http://www.matf.bg.ac.yu/nastavno/viktor/Differential_Entropy.pdf},
	keywords = {entropy, differential entropy, information theory},
}

@article{Learned-Miller:2003,
	author = {Learned-Miller, E. G. and Fisher, III, J. W.},
	title = {{ICA} using spacings estimates of entropy},
	journal = {Journal of Machine Learning Research},
	volume = {4},
	pages = {1271--1295},
	year = {2003},
	publisher = {MIT Press},
	keywords = {entropy, order statistics},
	abstract = {This paper presents a new algorithm for the independent components analysis (ICA) problem based on efficient spacings estimates of entropy. Like many previous methods, we minimize a standard measure of the departure from independence, the estimated Kullback-Leibler divergence between a joint distribution and the product of its marginals. To do this, we use a consistent and rapidly converging entropy estimator due to Vasicek. The resulting algorithm is simple, computationally efficient, intuitively appealing, and outperforms other well known algorithms. In addition, the estimator and the resulting algorithm exhibit excellent robustness to outliers. We present favorable comparisons to Kernel ICA, FAST-ICA, JADE, and extended Infomax in extensive simulations.},
}

@article{Paninski:2003,
	author = {Paninski, L.},
	title = {Estimation of Entropy and Mutual Information},
	journal = {Neural Computation},
	volume = {15},
	number = {6},
	pages = {1191--1253},
	year = {2003},
	publisher = {MIT Press},
}

@incollection{Miller:1955,
	author = {Miller, G. A.},
	editor = {Quastler, H.},
	title = {Note on the bias of information estimates},
	booktitle = {Information Theory in Psychology: Problems and Methods},
	pages = {95--100},
	year = {1955},
	publisher = {Free Press},
	address = {Glencoe, IL},
	keywords = {entropy, information theory},
}

@article{Eronen:2006,
	author = {Eronen, A. J. and Peltonen, V. T. and Tuomi, J. T. and Klapuri, A. P. and Fagerlund, S. and Sorsa, T. and Lorho, G. and Huopaniemi, J.},
	title = {Audio-based context recognition},
	journal = {IEEE Transactions on Audio, Speech, and Language Processing},
	volume = {14},
	number = {1},
	pages = {321--329},
	month = jan,
	year = {2006},
	doi = {10.1109/TSA.2005.854103},
	url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=1561288},
	keywords = {timbre, feature selection},
	abstract = {The aim of this paper is to investigate the feasibility of an audio-based context recognition system. Here, context recognition refers to the automatic classification of the context or an environment around a device. A system is developed and compared to the accuracy of human listeners in the same task. Particular emphasis is placed on the computational complexity of the methods, since the application is of particular interest in resource-constrained portable devices. Simplistic low-dimensional feature vectors are evaluated against more standard spectral features. Using discriminative training, competitive recognition accuracies are achieved with very low-order hidden Markov models (1-3 Gaussian components). Slight improvement in recognition accuracy is observed when linear data-driven feature transformations are applied to mel-cepstral features. The recognition rate of the system as a function of the test sequence length appears to converge only after about 30 to 60 s. Some degree of accuracy can be achieved even with less than 1-s test sequence lengths. The average reaction time of the human listeners was 14 s, i.e., somewhat smaller, but of the same order as that of the system. The average recognition accuracy of the system was 58\% against 69\%, obtained in the listening tests in recognizing between 24 everyday contexts. The accuracies in recognizing six high-level classes were 82\% for the system and 88\% for the subjects.},
}

@article{Shannon:1995,
	author = {Shannon, R. V. and Zeng, F.-G. and Kamath, V. and Wygonski, J. and Ekelid, M.},
	title = {Speech Recognition with Primarily Temporal Cues},
	journal = {Science},
	volume = {270},
	number = {5234},
	pages = {303--304},
	month = oct,
	year = {1995},
	doi = {10.1126/science.270.5234.303},
	eprint = {http://www.sciencemag.org/cgi/reprint/270/5234/303.pdf},
	url = {http://www.sciencemag.org/cgi/content/abstract/270/5234/303},
	keywords = {prosody, speech perception, speech, perception},
	abstract = {Nearly perfect speech recognition was observed under conditions of greatly reduced spectral information. Temporal envelopes of speech were extracted from broad frequency bands and were used to modulate noises of the same bandwidths. This manipulation preserved temporal envelope cues in each band but restricted the listener to severely degraded information on the distribution of spectral energy. The identification of consonants, vowels, and words in simple sentences improved markedly as the number of bands increased; high speech recognition performance was obtained with only three bands of modulated noise. Thus, the presentation of a dynamic temporal pattern in only a few broad spectral regions is sufficient for the recognition of speech.},
}

@article{Kendall:1939,
	author = {Kendall, M. G. and Smith, B. B.},
	title = {The Problem of {m} Rankings},
	journal = {The Annals of Mathematical Statistics},
	volume = {10},
	number = {3},
	pages = {275--287},
	month = sep,
	year = {1939},
	url = {http://links.jstor.org/sici?sici=0003-4851%28193909%2910%3A3%3C275%3ATPOR%3E2.0.CO%3B2-Q},
}

@article{Goebl:2004,
	author = {Widmer, Gerhard and Goebl, Werner},
	title = {Computational Models of Expressive Music Performance: The State of the Art},
	journal = {Journal of New Music Research},
	volume = {33},
	number = {3},
	pages = {203--216},
	month = sep,
	year = {2004},
	doi = {10.1080/0929821042000317804},
	url = {http://www.ingentaconnect.com/content/routledg/jnmr/2004/00000033/00000003/art00003},
	abstract = {This contribution gives an overview of the state of the art in the field of computational modeling of expressive music performance. The notion of predictive computational model is briefly discussed, and a number of quantitative models of various aspects of expressive performance are briefly reviewed. Four selected computational models are reviewed in some detail. Their basic principles and assumptions are explained and, wherever possible, empirical evaluations of the models on real performance data are reported. In addition to these models, which focus on general, common principles of performance, currently ongoing research on the formal characterisation of differences in individual performance style are briefly presented.},
}

@article{Depalle:2006,
	author = {Verfaille, Vincent and Wanderley, Marcelo M. and Depalle, Philippe},
	title = {Mapping strategies for gestural and adaptive control of digital audio effects},
	journal = {Journal of New Music Research},
	volume = {35},
	number = {1},
	pages = {71--93},
	month = mar,
	year = {2006},
	doi = {10.1080/09298210600696881},
	url = {http://www.ingentaconnect.com/content/routledg/jnmr/2006/00000035/00000001/art00007},
	abstract = {This paper discusses explicit mapping strategies for gestural and adaptive control of digital audio effects. We address the problem of defining what is the control and what is the effect. We then propose a mapping strategy derived from mapping techniques used in sound synthesis. The explicit mapping strategy we developed has two levels and two layers for each level: the first level is the adaptive control with a feature combination layer and a control signal conditioning layer; the second level is the gestural control layer. We give musical examples that illustrate the interest of this strategy.},
}

@article{Juhasz:2006,
	author = {Juhasz, Zoltan},
	title = {A systematic comparison of different European folk music traditions using self-organizing maps},
	journal = {Journal of New Music Research},
	volume = {35},
	number = {2},
	pages = {95--112},
	month = jun,
	year = {2006},
	doi = {10.1080/09298210600834912},
	url = {http://www.ingentaconnect.com/content/routledg/jnmr/2006/00000035/00000002/art00001},
	abstract = {The relations of six European musical cultures were investigated using self-organizing mapping of the melody contours. The high number of contour types characterizing more than 2 cultures simultaneously led to the supposition of a hypothetical common musical language, and the corresponding contour types were determined using a self-organizing map, being able to understand the six cultures in parallel. The analysis showed that significant parts of the common language are represented in the six national cultures, and it exists practically completely in Hungarian and Slovak folk music. The mapping of the melody sections and the contour types of the common language to a multidimensional melody space resulted in a clear musical description of the contacts, thus, the hypothesis of an archaic common musical tradition in Europe seems to be worth considering.},
}

@article{Flexer:2006,
	author = {Flexer, Arthur},
	title = {Statistical evaluation of music information retrieval experiments},
	journal = {Journal of New Music Research},
	volume = {35},
	number = {2},
	pages = {113--120},
	month = jun,
	year = {2006},
	doi = {10.1080/09298210600834946},
	url = {http://www.ingentaconnect.com/content/routledg/jnmr/2006/00000035/00000002/art00002},
	keywords = {ISMIR, statistics},
	abstract = {This work concerns the necessity of statistical evaluation of Music Information Retrieval (MIR) experiments. This necessity is motivated by applying fundamental notions of statistical hypotheses testing to MIR research. Minimum requirements concerning statistical evaluation are developed and the appropriate statistical techniques are introduced and exemplified in a genre classification context. Articles from the MIR literature are examined and criticized for the lack of statistical evaluation they contain.},
}

@article{Levine:2006,
	author = {Margulis, Elizabeth Hellmuth and Levine, William},
	title = {Timbre priming effects and expectation in melody},
	journal = {Journal of New Music Research},
	volume = {35},
	number = {2},
	pages = {175--182},
	month = jun,
	year = {2006},
	doi = {10.1080/09298210600835042},
	url = {http://www.ingentaconnect.com/content/routledg/jnmr/2006/00000035/00000002/art00005},
	abstract = {In this study, participants identified the timbre of pitches when they occurred in isolation, and again when they occurred appended to short melodies. For pitches congruent with the melody, timbre identification generally improved when the pitches were appended to the melody in comparison to when they occurred in isolation. In addition, the amount of improvement was broadly consistent with theoretical accounts of the degree to which the pitches were expected, given the preceding melody. This finding relates both to proposed interactions in processing between pitch and timbre, and to theoretical work regarding melodic expectations. It suggests that melodic expectations can be revealed implicitly, and is consistent with the idea that they operate at a relatively early stage of perceptual processing. In this study, priming effects were shown in listeners without musical training, demonstrating that expectations can develop in response to passive exposure to music, not only in response to formal training.},
}

@electronic{Lederer:2005,
	author = {Lederer, K.},
	title = {The Phonetics of Beatboxing (undergraduate degree dissertation)},
	year = {2005},
	url = {http://www.humanbeatbox.com/phonetics},
	urldate = {2008-02-13},
	keywords = {phonetics, beatbox, beatboxing},
	abstract = {Human beatboxing involves an extensive grammar of speech and non-speech sounds created by a range of articulatory methods on a number of different airstream mechanisms. The study is concerned with the production and perception of beatboxing from a phonetic point of view and it looks at the general phenomenon of how beatboxing is integrated with speech without intelligibility being lost. In order to consider beatboxing in relation to the production and perception of speech, three beatboxed sounds are studied in detail. The study looks at how they are created and how their articulation contributes to or limits the accuracy with which they imitate drum machine sounds. It was found that beatboxing sounds resemble electronic ones very closely yet still retain some characteristics of speech sounds so that they may ultimately be interpreted as either speech or non-speech or both.

The study draws on phonetics and psychoacoustics to discuss how audiences are fooled into thinking that they hear several different instruments all coming from the mouth of a single person.},
}

@incollection{Nattiez:2008,
	author = {Nattiez, J.-J.},
	title = {Inuit vocal games},
	booktitle = {The Canadian Encyclopedia},
	year = {2008},
	publisher = {Historica Foundation},
	url = {http://www.thecanadianencyclopedia.com/index.cfm?PgNm=TCE&Params=U1ARTU0001711},
	keywords = {inuit, throat-singing, Katajjaq},
}

@manual{Fukui:2003,
	author = {Fukui, R.},
	title = {{TIPA} Manual},
	year = {2003},
	url = {http://www.ctan.org/tex-archive/fonts/tipa/tipaman.pdf},
	keywords = {IPA, phonetic, alphabet},
}

@manual{Shure:2006,
	author = {{Shure Inc.}},
	title = {Shure {SM58} user guide},
	year = {2006},
	url = {http://www.shure.com/stellent/groups/public/@gms_gmi_web_ug/documents/web_resource/us_pro_sm58_ug.pdf},
	keywords = {microphone},
}

@book{IPA:1999,
	author = {{International Phonetic Association}},
	title = {Handbook of the {International Phonetic Association}: A Guide to the Use of the {International Phonetic Alphabet}},
	year = {1999},
	publisher = {Cambridge University Press},
	keywords = {IPA, phonetics, alphabet},
}

@book{Mabry:2002,
	author = {Mabry, S.},
	title = {Exploring Twentieth-Century Vocal Music: A Practical Guide to Innovations in Performance and Repertoire},
	year = {2002},
	publisher = {Oxford University Press, USA},
}

@book{Kohonen:2001,
	author = {Kohonen, T.},
	title = {{Self-Organizing} {Maps}},
	year = {2001},
	publisher = {Springer},
}

@book{Hastie:2001,
	author = {Hastie, T. and Tibshirani, R. and Friedman, J.},
	title = {The Elements of Statistical Learning: Data Mining, Inference, and Prediction},
	series = {Springer Series in Statistics},
	year = {2001},
	publisher = {Springer},
	keywords = {self-organizing map, SOM},
}

@incollection{Harvey:1998,
	author = {Harvey, J.},
	editor = {Harvey, J.},
	title = {So You Want to Use a {Likert} Scale?},
	booktitle = {Evaluation Cookbook},
	year = {1998},
	publisher = {Learning Technology Dissemination Initiative},
	url = {http://www.icbl.hw.ac.uk/ltdi/cookbook/info_likert_scale/index.html},
}

@inproceedings{Stowell:2008,
	author = {Stowell, D. and Plumbley, M. D. and Bryan-Kinns, N.},
	title = {Discourse analysis evaluation method for expressive musical interfaces},
	booktitle = {New Interfaces for Musical Expression},
	year = {2008},
	url = {http://www.elec.qmul.ac.uk/digitalmusic/papers/2008/StowellPlumbley08-nime.pdf},
	keywords = {Evaluation, qualitative methods, discourse analysis, voice, timbre, beatboxing},
	abstract = {In the NIME field there is an acknowledged paucity of reliable evaluation. Structured evaluation methods do exist, derived from other areas of HCI, but they largely focus on how precisely users can reproduce musical units. To evaluate the expressive and creative affordances of an interface, we need to go beyond precision; but these aspects are difficult to operationalise, particularly with quantitative methods. However, rigorous qualitative methods do exist and can be used to investigate such topics. We present a methodology based around user studies involving Discourse Analysis of speech. We also present an example of the methodology in use: we evaluate a musical interface which utilises vocal timbre, with a user group of beatboxers.},
}

@book{Preparata:1985,
	author = {Preparata, F. P. and Shamos, M. I.},
	title = {Computational Geometry: An Introduction},
	year = {1985},
	publisher = {Springer-Verlag},
}

@book{Patton:2002,
	author = {Patton, M. Q.},
	title = {Qualitative research and evaluation methods},
	year = {2002},
	publisher = {Sage Publications},
	keywords = {focus groups},
}

@book{Stewart:2007a,
	author = {Stewart, D. W.},
	title = {Focus groups: Theory and practice},
	year = {2007},
	publisher = {SAGE Publications},
}

@book{Kitzinger:1999,
	author = {Kitzinger, J.},
	title = {Developing focus group research: politics, theory, and practice},
	year = {1999},
	publisher = {SAGE Publications},
}

@misc{Ma:2007,
	author = {Ma, J. and Sun, Z.},
	title = {Copula Component Analysis},
	year = {2007},
	url = {http://arxiv.org/abs/cs.IR/0703095},
	urldate = {2008-01-07},
}

@article{Ekimov:2006,
	author = {Ekimov, A. and Sabatier, J. M.},
	title = {Vibration and sound signatures of human footsteps in buildings},
	journal = {The Journal of the Acoustical Society of America},
	volume = {120},
	pages = {762},
	year = {2006},
	publisher = {ASA},
	doi = {10.1121/1.2217371},
}

@inproceedings{Guo:2007,
	author = {Guo, W. and Zhang, L. and Xia, B.},
	title = {An Auditory Neural Feature Extraction Method for Robust Speech Recognition},
	booktitle = {Proceedings of the International Conference on Acoustics, Speech, and Signal Processing (ICASSP'07)},
	volume = {4},
	pages = {793--796},
	year = {2007},
}

@article{Arora:2007,
	author = {Arora, R. and Sethares, W. A.},
	title = {Adaptive Wavetable Oscillators},
	journal = {IEEE Transactions on Signal Processing},
	volume = {55},
	number = {9},
	pages = {4382--4392},
	year = {2007},
	doi = {10.1109/TSP.2007.896296},
	url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=4291842},
	abstract = {An adaptive oscillator is a system that can lock on to a time-varying input signal, synchronizing its output to both the frequency and phase of the input. A wavetable oscillator generates a periodic output by indexing into a lookup table that stores a single period of the waveform. An adaptive wavetable oscillator (AWO) combines these two ideas in a technique which separates the periodic output waveform from the parameters that control the adaptation of the frequency and phase of the waveform. This separation is advantageous because it decouples the state of the oscillator from the dynamics of the adaptation, allowing the process of synchronization to be interpreted as a simple gradient optimization on a cost function. The oscillations remain stable over a large and easily described range of parameter values, and analysis of the synchronization can proceed along lines familiar from standard adaptive systems. Key issues in the design of AWOs are: the class of admissible inputs, the shape of the wavetable, the parameters that will be controlled, and the adaptive algorithm that adjusts the parameters. This paper examines these issues through analysis and simulation, focusing on conditions that achieve the desired synchronization between output and input.},
}

@inproceedings{Clavel:2007,
	author = {Clavel, C. and Devillers, L. and Richard, G. and Vasilescu, I. and Ehrette, T.},
	title = {Detection and Analysis of Abnormal Situations Through Fear-Type Acoustic Manifestations},
	booktitle = {Proceedings of the International Conference on Acoustics, Speech, and Signal Processing (ICASSP'07)},
	volume = {4},
	pages = {21--24},
	year = {2007},
}

@inproceedings{Lugger:2007,
	author = {Lugger, M. and Yang, B.},
	title = {The Relevance of Voice Quality Features in Speaker Independent Emotion Recognition},
	booktitle = {Proceedings of the International Conference on Acoustics, Speech, and Signal Processing (ICASSP'07)},
	volume = {4},
	pages = {17--20},
	year = {2007},
}

@book{Johnstone:2008,
	author = {Johnstone, B.},
	title = {Discourse Analysis},
	edition = {2nd},
	year = {2008},
	publisher = {Blackwell},
}

@book{Wetherell:2001,
	author = {Wetherell, M. and Yates, S. and Taylor, S.},
	title = {Discourse as Data: A Guide for Analysis},
	year = {2001},
	publisher = {SAGE Publications},
	keywords = {discourse analysis},
}

@incollection{Uszkoreit:1996,
	author = {Uszkoreit, H.},
	editor = {Cole, R. A. and Mariani, J. and Uszkoreit, H. and Zaenen, A. and Zue, V.},
	title = {Discourse and Dialogue},
	booktitle = {Survey of the State of the Art in Human Language Technology},
	chapter = {6},
	year = {1996},
	publisher = {Center for Spoken Language Understanding, Oregon Health and Science University},
	url = {http://cslu.cse.ogi.edu/HLTsurvey/ch6node2.html},
	keywords = {discourse analysis},
}

@article{Antaki:2004,
	author = {Antaki, C. and Billig, M. and Edwards, D. and Potter, J.},
	title = {Discourse Analysis Means Doing Analysis: A Critique of Six Analytic Shortcomings},
	journal = {Discourse Analysis Online},
	volume = {1},
	number = {1},
	year = {2004},
	url = {http://extra.shu.ac.uk/daol/articles/v1/n1/a1/antaki2002002-paper.html},
	keywords = {discourse analysis, qualitative methods, research methodology},
	abstract = {A number of ways of treating talk and textual data are identified which fall short of discourse analysis. They are: (1) under-analysis through summary; (2) under-analysis through taking sides; (3) under-analysis through over-quotation or through isolated quotation; (4) the circular identification of discourses and mental constructs; (5) false survey; and (6) analysis that consists in simply spotting features. We show, by applying each of these to an extract from a recorded interview, that none of them actually analyse the data. We hope that illustrating shortcomings in this way will encourage further development of rigorous discourse analysis in social psychology.},
}

@article{Vurma:2007,
	author = {Vurma, A. and Ross, J.},
	title = {Timbre-Induced Pitch Deviations of Musical Sounds},
	journal = {Journal of Interdisciplinary Music Studies},
	volume = {1},
	number = {1},
	pages = {33--50},
	year = {2007},
	url = {http://www.musicstudies.org/first%20issue/FULL/Timbre_Induced_VURMA&ROSS(33-50).pdf},
	keywords = {Pitch, timbre, intonation, tuning},
	abstract = {This article deals with timbre-induced pitch deviations and their magnitude in environments designed to resemble those that performing musicians encounter in their daily practice. Two experiments were conducted. In the first experiment classically trained singers matched the pitch of synthesized sounds of the piano and oboe. The fundamental frequency of vocal sounds was on average 7 to 13 cents lower than the fundamental frequency of the instrumental sounds. The difference was more pronounced in the case of the piano timbre. In the second experiment, participants compared sounds produced by a single performer from the pitch-matching task of the first experiment to synthesized piano and oboe sounds. A three-alternative forced choice task was used, where participants judged whether the instrumental sounds were higher, lower or equal in pitch when compared to the vocal sounds. Results showed that the highest number of in-tune ratings was elicited if the vocal sounds were performed at about 20 cents below the fundamental frequency of the instrumental sounds. The difference between fundamental frequencies of the sounds perceived as equal in pitch may be explained by different energy distributions in their power spectra.},
}

@article{Garnier:2007,
	author = {Garnier, M. and Henrich, N. and Castellengo, M. and Sotiropoulos, D. and Dubois, D.},
	title = {Characterisation of Voice Quality in Western Lyrical Singing: from Teachers' Judgements to Acoustic Descriptions},
	journal = {Journal of Interdisciplinary Music Studies},
	volume = {1},
	number = {2},
	pages = {62--91},
	year = {2007},
	url = {http://www.musicstudies.org/VoiceQuality_JIMS_071204.pdf},
	keywords = {Voice quality, perception, cognitive representation, semantics, timbre, singing, verbal description, acoustic description},
	abstract = {This pilot study aims at defining the notion of voice quality in Western lyrical singing and seeks significant and objective criteria to characterise it, from both cognitive and acoustic points of view. We have chosen an approach based on the semantic analysis of experts' discourses, as we assume that the description of the acoustic signal itself cannot fully account for the characterisation of voice quality and that this notion rather relies on the listeners' cognitive representations, which allow them to elaborate a meaningful judgement from acoustic properties.

    Therefore we started this study with a listening test, conducted with 11 singing teachers who freely described the voice quality of 18 western lyrical extracts, recorded from three different male singers. The linguistic analysis of these verbalisations brought information about the specific lexical resources (in French) involved in discourses, from which inferences can be made regarding the different conceptions of voice quality and the listening modes of the lyrical expert field. For these listeners, voice quality appeared to be a series of cues allowing to identify sound production, the singer's identity and his emotional state rather than just a set of acoustic characteristics perceived and processed for themselves.

    Next, an acoustic database was recorded with three lyrical male singers, who produced different voice qualities illustrating graduations of some verbal descriptors. These terms  were selected for their relevance and  from the previous linguistic analyses. The acoustic analyses of this database allowed us to put forward different sound descriptors which may be cues for listeners to perceive some aspects of voice quality, and which may account for the semantic overlap of several criteria for evaluating voice quality.
},
}

@inproceedings{Stowell:2007c,
	author = {Stowell, D. and Bryan-Kinns, N. and Plumbley, M. D.},
	title = {Evaluating musical interfaces: Beyond precision? (Poster)},
	booktitle = {Digital Music Research Network Plus Two},
	month = dec,
	year = {2007},
	organization = {Queen Mary, University of London},
	address = {London},
	abstract = {In the field of new musical instruments/interfaces there is an acknowledged paucity of evaluation (Poupyrev et al 2001). Some work on evaluation methodology does exist, taking its cue from evaluation methods in HCI (Wanderley {\&} Orio 2002), but this evaluates a system in terms of the accuracy in reproduction of musical units such as glissandi or arpeggios. For expressive or improvised performance, precision is not enough: a performer desires an interface that is in some sense intuitive and offers sufficient freedom of expression (Magnusson {\&} Mendieta 2007). Accuracy can be operationalised numerically using HCI methods, but these other desirables cannot easily be condensed to numerical measures.

We present an experimental design which aims to contribute towards the evaluation of musical interfaces by assessing users' experiences more broadly than precision of reproduction. We use qualitative methods to analyse user experience, namely Discourse Analysis performed on semi-structured interviews. We also use a guided exploration technique to allow the user to explore the system in a relatively short period of time.},
}

@mastersthesis{Shannon:2002,
	author = {Shannon, T.},
	title = {Catch the Beat: The Transformation from African Drumbeats to Hip-hop Synthesized Basslines},
	school = {Parsons Design School},
	year = {2002},
	url = {http://a.parsons.edu/~tawanda/thesis/revisedresearch.pdf},
	keywords = {beatboxing},
}

@inproceedings{Yao:1999,
	author = {Yao, Y. Y. and Wong, S. K. M. and Butz, C. J.},
	title = {On Information-Theoretic Measures of Attribute Importance},
	booktitle = {Pacific-Asia Conference on Knowledge Discovery and Data Mining},
	pages = {133--137},
	year = {1999},
	url = {http://www2.cs.uregina.ca/~butz/publications/pakdd99.pdf},
	abstract = {An attribute is deemed important in data mining if it partitions the database such that previously unknown regularities are observable. Many information-theoretic measures have been applied to quantify the importance of an attribute. In this paper, we summarize and critically analyze these measures. 
},
}

@book{Banister:1994,
	Author = {Banister, P. and Burman, E. and Parker, I. and Taylor, M. and Tindall, C.},
	Title = {Qualitative Methods in Psychology: A Research Guide},
	Publisher = {Open University Press},
	Address = {Buckingham},
	Year = {1994},
	Isbn = {978-0335191819},
	Keywords = {discourse analysis},
}

% Month uses the standard three-letter macro so the bibliography style controls its rendering.
@article{Xu:2007,
	Author = {Xu, W. and Chang, C. and Hung, Y. S. and Kwan, S. K. and Fung, P. C. W.},
	Title = {Order Statistics Correlation Coefficient as a Novel Association Measurement With Applications to Biosignal Analysis},
	Journal = {IEEE Transactions on Signal Processing},
	Volume = {55},
	Number = {12},
	Pages = {5552--5563},
	Month = dec,
	Year = {2007},
	Issn = {1053-587X},
	Doi = {10.1109/TSP.2007.899374},
	Keywords = {OSCC},
	Abstract = {In this paper, we propose a novel correlation coefficient based on order statistics and rearrangement inequality. The proposed coefficient represents a compromise between the Pearson's linear coefficient and the two rank-based coefficients, namely Spearman's rho and Kendall's tau. Theoretical derivations show that our coefficient possesses the same basic properties as the three classical coefficients. Experimental studies based on four models and six biosignals show that our coefficient performs better than the two rank-based coefficients when measuring linear associations; whereas it is well able to detect monotone nonlinear associations like the two rank-based coefficients. Extensive statistical analyses also suggest that our new coefficient has superior anti-noise robustness, small biasedness, high sensitivity to changes in association, accurate time-delay detection ability, fast computational speed, and robustness under monotone nonlinear transformations.
},
}

% Title stored in Title Case rather than all capitals; initials are spaced so abbreviating styles keep both; month is a macro.
@article{Martin:1993,
	Author = {Martin, P. P.},
	Title = {Statistical Mechanics of Biological and Other Complex Experimental Time Series: Assessing Geometrical and Dynamical Properties},
	Journal = {International Journal of Bifurcation and Chaos},
	Volume = {3},
	Number = {3},
	Pages = {717--727},
	Month = jun,
	Year = {1993},
	Doi = {10.1142/S0218127493000623},
	Abstract = {Biological and other experimental time series often exhibit complex and possibly chaotic behavior that may not be completely deterministic or completely random. Particularly problematic is the fact that measures of chaos such as the dynamical or geometrical invariants, e.g. the correlation dimension, Lyapunov exponents, or Kolmogorov entropy, often cannot be calculated from short, noisy, and possibly highly discretized experimental time series. Here, it is argued that nonrandom structure in the data may be uncovered by using a conceptual framework based on statistical mechanics and the standard correlation integral as a computational tool. A new use of the generalized correlation integral is proposed to assess statistically the occurrence of nonrandom spatiotemporal patterns in experimental data. We argue that nonrandomness of a time series can be assessed by the statistics of the topology of the reconstructed state space distribution, which we quantify via the generalized correlation integral. This approach provides a simple, graphical tool which can yield immediate information about the length scales and sequence lengths where the data may appear to be different from random, and also may provide a data classification tool based on spatiotemporal patterns. We demonstrate the usefulness of this approach using several numerical examples, including data from experimental biological systems. Finally, we propose that particular characteristics of such patterns imply considerable macroscopic information about the behavior of the generating system, and qualitative changes in the time series.},
}

% Month uses the standard three-letter macro so the bibliography style controls its rendering.
@article{Reuter:1999,
	Author = {Reuter, R. and Herzel, H. and Orglmeister, R.},
	Title = {Simulations of vocal fold vibrations with an analog circuit},
	Journal = {International Journal of Bifurcation and Chaos},
	Volume = {9},
	Number = {6},
	Pages = {1075--1088},
	Month = jun,
	Year = {1999},
	Doi = {10.1142/S0218127499000742},
	Abstract = {The human voice source generates complex signals including subharmonics and toroidal oscillations. Essential features of voice production are covered by two-mass models where each vocal fold is represented by two oscillators. We present a related analog circuit which allows online analysis of the generated complex signals. Using narrow-band spectrograms various bifurcations due to left--right asymmetry are monitored. Time series, spectra, return maps, and response measurements provide further detailed information about phase-locking and toroidal oscillations. Finally, possible clinical applications are discussed.
},
}

@article{Beirlant:1997,
	Author = {Beirlant, J. and Dudewicz, E. J. and Gyorfi, L. and van der Meulen, E. C.},
	Title = {Nonparametric entropy estimation: An overview},
	Journal = {International Journal of Mathematical and Statistical Sciences},
	Volume = {6},
	Pages = {17--39},
	Year = {1997},
	Url = {http://ecf.caltech.edu/summerlecture/docs/Entropy%20estimation.pdf},
	Keywords = {entropy},
	Abstract = {An overview is given of the several methods in use for the nonparametric estimation of the differential entropy of a continuous random variable. The properties of various methods are compared. Several applications are given such as tests for goodness-of-fit, parameter estimation, quantization theory and spectral estimation. 
},
}

@article{Darbellay:1999,
	Author = {Darbellay, G. A. and Vajda, I.},
	Title = {Estimation of the information by an adaptive partitioning of the observation space},
	Journal = {IEEE Transactions on Information Theory},
	Volume = {45},
	Number = {4},
	Pages = {1315--1321},
	Year = {1999},
	Doi = {10.1109/18.761290},
	Keywords = {entropy, mutual information},
	Abstract = {We demonstrate that it is possible to approximate the mutual information arbitrarily closely in probability by calculating the relative frequencies on appropriate partitions and achieving conditional independence on the rectangles of which the partitions are made. Empirical results, including a comparison with maximum-likelihood estimators, are presented},
}

% Booktitle rewritten from the inverted index-export form into natural reading order; the acronym is braced.
@inproceedings{Maragos:1991,
	Author = {Maragos, P.},
	Title = {Fractal aspects of speech signals: dimension and interpolation},
	Booktitle = {International Conference on Acoustics, Speech, and Signal Processing ({ICASSP}-91)},
	Pages = {417--420},
	Year = {1991},
	Abstract = {The nonlinear dynamics of air flow during speech production may often result in some small or large degree of turbulence. The author quantifies the geometry of speech turbulence, as reflected in the fragmentation of the time signal, by using fractal models. He describes an efficient algorithm for estimating the short-time fractal dimension of speech segmentation and sound classification. He also develops a method for fractal speech interpolation which can be used to synthesize controlled amounts of turbulence in speech or to increase its sampling rate by preserving not its bandwidth (as is classically done) but rather its fractal dimension},
}

% Title stored in Title Case rather than all capitals; proceedings name spelled out in full; month is a macro.
@inproceedings{Stewart:2007,
	Author = {Stewart, R. and Sandler, M.},
	Title = {Statistical Measures of Early Reflections of Room Impulse Responses},
	Booktitle = {Proceedings of the 10th International Conference on Digital Audio Effects ({DAFx}-07)},
	Address = {Bordeaux, France},
	Month = sep,
	Year = {2007},
	Url = {http://dafx.labri.fr/papers/p059.pdf},
	Abstract = {An impulse response of an enclosed reverberant space is composed of three basic components: the direct sound, early reflections and late reverberation. While the direct sound is a single event that can be easily identified, the division between the early reflections and late reverberation is less obvious as there is a gradual transition between the two. 

This paper explores two statistical measures that can aid in determining a point in time where the early reflections have transitioned into late reverberation. These metrics exploit the similarities between late reverberation and Gaussian noise that are not commonly found in early reflections. Unlike other measures, these need no prior knowledge about the rooms such as geometry or volume. 
},
}

% Booktitle rewritten from the inverted index-export form into natural reading order; acronyms are braced.
@inproceedings{Klapuri:2007,
	Author = {Klapuri, A.},
	Title = {Analysis of Musical Instrument Sounds by Source-Filter-Decay Model},
	Booktitle = {{IEEE} International Conference on Acoustics, Speech and Signal Processing ({ICASSP} 2007)},
	Volume = {1},
	Year = {2007},
	Keywords = {harmonic cepstrum},
	Abstract = {This paper proposes a way of modelling the time-varying spectral energy distribution of musical instrument sounds. The model consists of an excitation signal, a body response filter, and a loss filter which implements a frequency-dependent decay. The three parts are further represented with a linear model which allows controlling the number of parameters involved. A method is proposed for estimating all the model parameters jointly, taking into account additive noise. The method is evaluated by measuring its accuracy in representing 33 musical instruments and by testing its usefulness in extracting the melodic line of one instrument from a polyphonic audio signal.
},
}

% Booktitle rewritten into natural reading order; percent signs in the abstract are escaped so they cannot act as TeX comment characters.
@inproceedings{Klapuri:2001,
	Author = {Klapuri, A.},
	Title = {Multipitch estimation and sound separation by the spectral smoothness principle},
	Booktitle = {Proceedings of the 2001 {IEEE} International Conference on Acoustics, Speech, and Signal Processing ({ICASSP}'01)},
	Volume = {5},
	Year = {2001},
	Doi = {10.1109/ICASSP.2001.940384},
	Abstract = {A processing principle is proposed for finding the pitches and separating the spectra of concurrent musical sounds. The principle, spectral smoothness, is used in the human auditory system which separates sounds partly by assuming that the spectral envelopes of real sounds are continuous. Both theoretical and experimental evidence is presented for the vital importance of spectral smoothness in resolving sound mixtures. Three algorithms of varying complexity are described which successfully implement the new principle. In validation experiments, random pitch and sound source combinations were analyzed in a single time frame. The number of simultaneous sounds ranged from one to six, the database comprising sung vowels and 26 musical instruments. Usage of a specific yet straightforward smoothing operation corrected approximately half of the pitch errors that occurred in a system which was otherwise identical but did not use the smoothness principle. In random four-voice mixtures, pitch error rate reduced from 18\% to 8.1\%},
}

% Initials are spaced so abbreviating styles keep both.
% NOTE(review): the abstract below discusses tied-mixture hidden Markov models and does not mention the
% information-bearing components named in the title; it may belong to a different paper -- verify against the source.
@article{Yang:2000,
	Author = {Yang, H. H. and Hermansky, H.},
	Title = {Search for Information Bearing Components in Speech},
	Journal = {Advances in Neural Information Processing Systems},
	Volume = {9},
	Pages = {803--812},
	Year = {2000},
	Url = {http://www.cs.cmu.edu/Groups/NIPS/NIPS99/99papers-pub-on-web/Named/YangHermansky.ps},
	Abstract = {The acoustic-modeling problem in automatic speech recognition is examined with the goal of unifying discrete and continuous parameter approaches. To model a sequence of information-bearing acoustic feature vectors which has been extracted from the speech waveform via some appropriate front-end signal processing, a speech recognizer basically faces two alternatives: (1) assign a multivariate probability distribution directly to the stream of vectors, or (2) use a time-synchronous labeling acoustic processor to perform vector quantization on this stream, and assign a multinomial probability distribution to the output of the vector quantizer. With a few exceptions, these two methods have traditionally been given separate treatment. A class of very general hidden Markov models which can accommodate feature vector sequences lying either in a discrete or in a continuous space is considered; the new class allows one to represent the prototypes in an assumption-limited, yet convenient way, as tied mixtures of simple multivariate densities. Speech recognition experiments, reported for two (5000- and 20000-word vocabulary) office correspondence tasks, demonstrate some of the benefits associated with this technique},
}

% Publisher name and publisher city are kept in separate fields.
@article{Treves:1995,
	Author = {Treves, A. and Panzeri, S.},
	Title = {The upward bias in measures of information derived from limited data samples},
	Journal = {Neural Computation},
	Volume = {7},
	Number = {2},
	Pages = {399--407},
	Year = {1995},
	Publisher = {MIT Press},
	Address = {Cambridge, MA, USA},
}

% Online document; the access date is stored in ISO 8601 form (29 October 2007) so date-aware tools can parse it.
@electronic{Srinivasa:2007,
	Author = {Srinivasa, S.},
	Title = {A review on Multivariate Mutual Information},
	Year = {2007},
	Url = {http://www.nd.edu/~jnl/ee80653/tutorials/sunil.pdf},
	Urldate = {2007-10-29},
	Keywords = {mutual information},
}

% Conference paper: typed as inproceedings with the conference name in Booktitle instead of Journal.
% Initials are spaced so abbreviating styles keep all of them.
@inproceedings{Ellis:2000,
	Author = {Ellis, D. P. W. and Bilmes, J. A.},
	Title = {Using mutual information to design feature combinations},
	Booktitle = {International Conference on Spoken Language Processing},
	Volume = {3},
	Pages = {79--82},
	Year = {2000},
	Url = {ftp://ftp.icsi.berkeley.edu/pub/speech/papers/icslp00-cmi.pdf},
	Abstract = {Combination of different feature streams is a well-established method for improving speech recognition performance. This empirical success, however, poses theoretical problems when trying to design combination systems: is it possible to predict which feature streams will combine most advantageously, and which of the many possible combination strategies will be most successful for the particular feature streams in question? We approach these questions with the tool of conditional mutual information (CMI), estimating the amount of information that one feature stream contains about the other, given knowledge of the correct subword unit label. We argue that CMI of the raw feature streams should be useful in deciding whether to merge them together as one large stream, or to feed them separately into independent classifiers for later combination; this is only weakly supported by our results. We also argue that CMI between the outputs of independent classifiers based on each stream should help predict which streams can be combined most beneficially. Our results confirm the usefulness of this measure. 
},
}

% The last author uses the "Last, Jr, First" form so the suffix is not parsed as part of the surname.
% Initials are spaced so abbreviating styles keep all of them.
@article{Tourassi:2001,
	Author = {Tourassi, G. D. and Frederick, E. D. and Markey, M. K. and Floyd, Jr., C. E.},
	Title = {Application of the mutual information criterion for feature selection in computer-aided diagnosis},
	Journal = {Medical Physics},
	Volume = {28},
	Pages = {2394--2402},
	Year = {2001},
	Doi = {10.1118/1.1418724},
	Url = {http://web.cse.msu.edu/~cse902/S03/mi_features.pdf},
	Abstract = {The purpose of this study was to investigate an information theoretic approach to feature selection for computer-aided diagnosis (CAD). The approach is based on the mutual information (MI) concept. MI measures the general dependence of random variables without making any assumptions about the nature of their underlying relationships. Consequently, MI can potentially offer some advantages over feature selection techniques that focus only on the linear relationships of variables. This study was based on a database of statistical texture features extracted from perfusion lung scans. The ultimate goal was to select the optimal subset of features for the computer-aided diagnosis of acute pulmonary embolism (PE). Initially, the study addressed issues regarding the approximation of MI in a limited dataset as it is often the case in CAD applications. The MI selected features were compared to those features selected using stepwise linear discriminant analysis and genetic algorithms for the same PE database. Linear and nonlinear decision models were implemented to merge the selected features into a final diagnosis. Results showed that the MI is an effective feature selection criterion for nonlinear CAD models overcoming some of the well-known limitations and computational complexities of other popular feature selection techniques in the field.},
}

% The DOI is stored bare, without the doi: prefix, so styles can build the resolver link themselves.
@article{Caclin:2007,
	Author = {Caclin, A. and Giard, M.-H. and Smith, B. K. and McAdams, S.},
	Title = {Interactive processing of timbre dimensions: A Garner interference study},
	Journal = {Brain Research},
	Volume = {1138},
	Pages = {159--170},
	Year = {2007},
	Doi = {10.1016/j.brainres.2006.12.065},
	Url = {http://www.sciencedirect.com/science/article/B6SYR-4MP56B6-3/2/d621cb726ab3b5ade33014287632461c},
	Keywords = {Audition; Timbre; Dimensional interaction; Garner interference; Reaction time},
	Abstract = { Timbre characterizes the identity of a sound source. Psychoacoustic studies have revealed that timbre is a multidimensional perceptual attribute with multiple underlying acoustic dimensions of both temporal and spectral types. Here we investigated the relations among the processing of three major timbre dimensions characterized acoustically by attack time, spectral centroid, and spectrum fine structure. All three pairs of these dimensions exhibited Garner interference: speeded categorization along one timbre dimension was affected by task-irrelevant variations along another timbre dimension. We also observed congruency effects: certain pairings of values along two different dimensions were categorized more rapidly than others. The exact profile of interactions varied across the three pairs of dimensions tested. The results are interpreted within the frame of a model postulating separate channels of processing for auditory attributes (pitch, loudness, timbre dimensions, etc.) with crosstalk between channels.},
}

% Journal name normalised to the form used elsewhere in this file; the bracketed "see also" export artefact is dropped.
@article{Chan:1995,
	Author = {Chan, M. V. and Heinen, J. A. and Niederjohn, R. J.},
	Title = {Formulas for the impulse response of a digital filter with an arbitrary piecewise-linear frequency response},
	Journal = {IEEE Transactions on Signal Processing},
	Volume = {43},
	Number = {1},
	Pages = {308--310},
	Year = {1995},
	Doi = {10.1109/78.365313},
	Abstract = {Simple formulas are derived for the impulse response of a digital filter with an arbitrary piecewise-linear frequency response. These are generalizations of a formula given in Oppenheim and Schafer (1989), and are useful in designing linear-phase FIR filters by the ``windowing'' method},
}

% The serial identifier belongs to a journal, so it is stored as Issn rather than Isbn.
% The duplicate Journal1 export field (identical to Journal) is dropped.
@article{Togneri:1992,
	Author = {Togneri, R. and Alder, M. D. and Attikiouzel, Y.},
	Title = {Dimension and structure of the speech space},
	Journal = {Communications, Speech and Vision, IEE Proceedings I},
	Volume = {139},
	Number = {2},
	Pages = {123--127},
	Year = {1992},
	Issn = {0956-3776},
	Keywords = {encoding; filtering and prediction theory; speech analysis and processing; LPC coefficients; dimension; filter bank space; four-dimensional manifold; speech space; structure},
	Abstract = {Presents evidence to support the claim that the space of trajectories of speech exists and may be approximated by a four-dimensional manifold which is nonlinearly embedded in both a space of LPC coefficients and also in a filter bank space. The authors also investigate the possibility that there are different dimensions for different phonetic categories, but find no evidence to support this hypothesis. The dimension is of interest since it is the smallest number of independent parameters needed to specify speech},
}

% The first author's initials are written as separate tokens so abbreviating styles keep both.
@article{Alder:1991,
	Author = {Alder, M. D. and Togneri, R. and Attikiouzel, Y.},
	Title = {Dimension of the speech space},
	Journal = {Communications, Speech and Vision, IEE Proceedings I},
	Volume = {138},
	Number = {3},
	Pages = {207--214},
	Year = {1991},
	Abstract = {Defines a statistic for estimating the intrinsic dimension of a finite set of points on the assumption that they lie on a smoothly embedded manifold, when of course, the dimension is an integer. The authors test the method on finite sets drawn from known manifolds and show that it is robust. They also apply it to the Lorenz attractor. Finally they apply it to speech data of the type used by Tattersal et al (1983). It is concluded that the speech space is not discernibly a low-dimensional manifold at all, and that a more plausible hypothesis is that the space is an open subset of the enclosing space. A measure is constructed of the extent to which the surface that the Kohonen algorithm fits to the speech space is buckled or crinkled related to the mean absolute curvature. The speech space can be approximated with a low-dimensional manifold, but it has dimension greater than two.
},
}

% Proceedings name completed with its article and braced acronym; month is a macro.
@inproceedings{Roy:2007,
	Author = {Roy, P. and Pachet, F. and Krakowski, S.},
	Title = {Analytical Features for the Classification of Percussive Sounds: The case of the Pandeiro},
	Booktitle = {Proceedings of the 10th International Conference on Digital Audio Effects ({DAFx}-07)},
	Month = sep,
	Year = {2007},
	Url = {http://www.csl.sony.fr/downloads/papers/2007/roy-07b.pdf},
	Keywords = {genetic programming},
	Abstract = {There is an increasing need for automatically classifying sounds for MIR and interactive music applications. In the context of supervised classification, we describe an approach that improves the performance of the general bag-of-frame scheme without loosing its generality. This method is based on the construction and exploitation of specific audio features, called analytical, as input to classifiers. These features are better, in a sense we define precisely than standard, general features, or even than ad hoc features designed by hand for specific problems. To construct these features, our method explores a very large space of functions, by composing basic operators in syntactically correct ways. These operators are taken from the Mathematical and Audio Processing domains. Our method allows us to build a large number of these features, evaluate and select them automatically for arbitrary audio classification problems. 

We present here a specific study concerning the analysis of Pandeiro (Brazilian tambourine) sounds. Two problems are considered: the classification of entire sounds, for MIR applications, and the classification of attacks portions of the sound only, for interactive music applications. We evaluate precisely the gain obtained by analytical features on these two problems, in comparison with standard approaches. 
},
}

@article{Hoare:1961,
	Author = {Hoare, C. A. R.},
	Title = {Algorithm 63 (partition) and algorithm 65 (find)},
	Journal = {Communications of the ACM},
	Volume = {4},
	Pages = {321--322},
	Year = {1961},
	Doi = {10.1145/366622.366642},
}

% Publisher name and publisher city are kept in separate fields; the proceedings title is capitalised consistently.
@inproceedings{Beygelzimer:2006,
	Author = {Beygelzimer, A. and Kakade, S. and Langford, J.},
	Title = {Cover trees for nearest neighbor},
	Booktitle = {Proceedings of the 23rd International Conference on Machine Learning},
	Pages = {97--104},
	Year = {2006},
	Publisher = {ACM Press},
	Address = {New York, NY, USA},
	Url = {http://hunch.net/~jl/projects/cover_tree/icml_final/final-icml.pdf},
	Keywords = {embedding dimension, false nearest neighbours},
}

@book{Kantz:2004,
	Author = {Kantz, H. and Schreiber, T.},
	Title = {Nonlinear Time Series Analysis},
	Series = {Cambridge Nonlinear Science Series},
	Publisher = {Cambridge University Press},
	Year = {2004},
}

% Month is a macro; the journal name is spelled out; the arrow in the abstract is TeX math instead of a raw
% Unicode character, which classic 8-bit BibTeX toolchains can garble.
@article{Kennel:1992,
	Author = {Kennel, M. B. and Brown, R. and Abarbanel, H. D. I.},
	Title = {Determining embedding dimension for phase-space reconstruction using a geometrical construction},
	Journal = {Physical Review A},
	Volume = {45},
	Number = {6},
	Pages = {3403--3411},
	Numpages = {8},
	Month = mar,
	Year = {1992},
	Publisher = {American Physical Society},
	Doi = {10.1103/PhysRevA.45.3403},
	Url = {http://prola.aps.org/pdf/PRA/v45/i6/p3403_1},
	Keywords = {embedding dimension, false nearest neighbours},
	Abstract = {We examine the issue of determining an acceptable minimum embedding dimension by looking at the behavior of near neighbors under changes in the embedding dimension from $d \rightarrow d+1$. When the number of nearest neighbors arising through projection is zero in dimension dE, the attractor has been unfolded in this dimension. The precise determination of dE is clouded by ``noise,'' and we examine the manner in which noise changes the determination of dE. Our criterion also indicates the error one makes by choosing an embedding dimension smaller than dE. This knowledge may be useful in the practical analysis of observed time series.},
}

% Month uses the standard three-letter macro so the bibliography style controls its rendering.
% NOTE(review): each author is stored as a hyphenated surname plus a single initial (e.g. Qing-Fang, M.);
% the given and family names may have been swapped by an export tool -- verify against the publication.
@article{Qing-Fang:2007,
	Author = {Qing-Fang, M. and Yu-Hua, P. and Pei-Jun, X.},
	Title = {A new method of determining the optimal embedding dimension based on nonlinear prediction},
	Journal = {Chinese Physics},
	Volume = {16},
	Number = {5},
	Pages = {1252--1257},
	Month = may,
	Year = {2007},
	Doi = {10.1088/1009-1963/16/5/014},
	Keywords = {embedding dimension, nonlinear autoregressive prediction model, nonlinear time series},
	Abstract = {A new method is proposed to determine the optimal embedding dimension from a scalar time series in this paper. This method determines the optimal embedding dimension by optimizing the nonlinear autoregressive prediction model parameterized by the embedding dimension and the nonlinear degree. Simulation results show the effectiveness of this method. And this method is applicable to a short time series, stable to noise, computationally efficient, and without any purposely introduced parameters.
},
}

% Standard styles ignore a Journal field on inproceedings entries, so the volume title moves to Booktitle
% and the proceedings series name moves to Series. Initials are spaced so abbreviating styles keep both.
@inproceedings{Kaplan:1993,
	Author = {Kaplan, D. T.},
	Title = {A model-independent technique for determining the embedding dimension},
	Booktitle = {Chaos in Communications},
	Series = {Proceedings of {SPIE}},
	Pages = {236--240},
	Year = {1993},
	Doi = {10.1117/12.162676},
	Abstract = {The method of lag-embedding, common in the analysis of signals in the context of nonlinear dynamics, requires the selection of an embedding dimension. This embedding dimension is analogous to the model order in a linear prediction model, but the order of a linear prediction model is of little use in characterizing chaotic signals or in indicating an appropriate embedding dimension for nonlinear analysis. Nonlinear prediction models, however, have been successfully used for this purpose. Here, we describe a technique for selecting an appropriate embedding dimension that is motivated by nonlinear prediction, but does not require the specification of the form of a prediction model.
},
}

% The issue range uses an en-dash (double hyphen); publisher name and city are split into separate fields
% and the duplicated country is removed.
@article{Cao:1997,
	Author = {Cao, L.},
	Title = {Practical method for determining the minimum embedding dimension of a scalar time series},
	Journal = {Physica D},
	Volume = {110},
	Number = {1--2},
	Pages = {43--50},
	Year = {1997},
	Publisher = {Elsevier Science Publishers BV},
	Address = {Amsterdam, The Netherlands},
	Doi = {10.1016/S0167-2789(97)00118-8},
	Keywords = {embedding dimension},
	Abstract = {A practical method is proposed to determine the minimum embedding dimension from a scalar time series. It has the following advantages: (1) does not contain any subjective parameters except for the time-delay for the embedding; (2) does not strongly depend on how many data points are available; (3) can clearly distinguish deterministic signals from stochastic signals; (4) works well for time series from high-dimensional attractors; (5) is computationally efficient. Several time series are tested to show the above advantages of the method.
},
}

% NOTE(review): each author is stored as a hyphenated surname plus a single initial (e.g. Chun-Hua, B.);
% the given and family names may have been swapped by an export tool -- verify against the publication.
@article{Chun-Hua:2004,
	Author = {Chun-Hua, B. and Xin-Bao, N.},
	Title = {Determining the minimum embedding dimension of nonlinear time series based on prediction method},
	Journal = {Chinese Physics},
	Volume = {13},
	Number = {5},
	Pages = {633--636},
	Year = {2004},
	Keywords = {embedding dimension},
}

% The method name in the abstract had been garbled to "ffi-test" by text extraction; it is restored as the delta-test.
% The abstract is truncated in the source (trailing ellipsis) and is kept that way.
@article{Pi:1994,
	Author = {Pi, H. and Peterson, C.},
	Title = {Finding the embedding dimension and variable dependencies in time series},
	Journal = {Neural Computation},
	Volume = {6},
	Number = {3},
	Pages = {509--520},
	Year = {1994},
	Url = {http://www.thep.lu.se/ftp/pub/Preprints/93/lu_tp_93_04.ps.gz},
	Keywords = {embedding dimension},
	Abstract = {We present a general method, the $\delta$-test, which establishes functional dependencies given a table of measurements. The approach is based on calculating conditional probabilities from vector component distances. Imposing the requirement of continuity of the underlying function the obtained values of the conditional probabilities carry information on the embedding dimension and variable dependencies. The power of the method is illustrated on synthetic time-series with different time-lag...
},
}

@inproceedings{Reiss:2003,
	Author = {Reiss, J. and Sandler, M. B.},
	Title = {Nonlinear Time Series Analysis Of Musical Signals},
	Booktitle = {{DAFX}: Digital Audio Effects},
	Year = {2003},
	Url = {http://www.elec.qmul.ac.uk/dafx03/proceedings/pdfs/dafx71.pdf},
	Keywords = {fractal dimension},
	Abstract = {In this work the techniques of chaotic time series analysis are applied to music. The audio stream from musical recordings are treated as representing experimental data from a dynamical system. Several performance of well-known classical pieces are analysed using recurrence analysis, stationarity measures, information metrics, and other time series based approaches. The benefits of such analysis are reported. 
},
}

% Month is a macro; the accent in Poincare is restored with a BibTeX-safe escape.
@phdthesis{Reiss:2001,
	Author = {Reiss, J.},
	Title = {The analysis of chaotic time series},
	School = {Georgia Institute of Technology},
	Month = may,
	Year = {2001},
	Url = {http://www.elec.qmul.ac.uk/people/josh/documents/Reiss-PhDThesis.pdf},
	Keywords = {fractal dimension},
	Abstract = {Chaotic time series analysis methods were applied to several experimental systems. Analysis of a Poincar{\'e} section of magnetoelastic ribbon time series was used to construct symbolic dynamics and extract empirical quantities such as fractal dimension and Lyapunov exponents. In the pulse thermal combustion engine, analysis of several data sets was used to establish high dimensionality and complex dynamics. Data sets were also analyzed from an electric step motor. Low dimensional chaotic dynamics were observed and quantified. Each of these systems exhibited nonstationarity and other behaviors that made the analysis difficult and demonstrated flaws in established time series analysis techniques. Thus methods were devised to improve these techniques and synthesize them into a coherent package. Finally, a new design was proposed for a chaotic sigma delta modulator. Techniques from nonlinear dynamics and chaos theory were used to show that this modulator was stable and had desirable properties not exhibited in previously proposed designs.
},
}

@inproceedings{Rose:2007,
	Author = {Rose, R. and Momayyez, P.},
	Title = {Integration of Multiple Feature Sets for Reducing Ambiguity in {ASR}},
	Booktitle = {International Conference on Acoustics, Speech and Signal Processing (ICASSP 2007)},
	Volume = {4},
	Pages = {325--328},
	Year = {2007},
	Doi = {10.1109/ICASSP.2007.366915},
	Abstract = {The main goal of this paper is to investigate the feasibility of exploiting the invariance properties associated with articulatory based acoustic features to reduce ambiguity in ASR search. A multivalued phonological feature set defined by King and Taylor is used along with a time delay neural network implementation of phonological feature detectors to produce eight independent phonological feature streams (S. King and P. Taylor, 2000). Hidden Markov models (HMMs) defined over these phonological feature streams are combined with HMMs defined over spectral energy based mel frequency cepstrum coefficient (MFCC) acoustic features through a lattice re-scoring procedure. It is shown that significant improvements in phone recognition accuracy are obtained for this combined system relative to phone accuracy obtained for MFCC based HMMs alone. A study is also performed to analyze the effects of uncertainty in phonological feature detection},
}

@inproceedings{Xiong:2003,
	Author = {Xiong, Z. and Radhakrishnan, R. and Divakaran, A. and Huang, T. S.},
	Booktitle = {Proceedings of the International Conference on Multimedia and Expo (ICME'03)},
	Pages = {397--400},
	Title = {Comparing {MFCC} and {MPEG-7} audio features for feature extraction, maximum likelihood {HMM} and entropic prior {HMM} for sports audio classification},
	Volume = {3},
	Year = {2003},
	Abstract = {We present a comparison of 6 methods for classification of sports audio. For the feature extraction we have two choices: MPEG-7 audio features and Mel-scale Frequency Cepstrum Coefficients (MFCC). For the classification we also have two choices: Maximum Likelihood Hidden Markov Models (ML-HMM) and Entropic Prior HMM (EP-HMM). EP-HMM, in turn, have two variations: with and without trimming of the model parameters. We thus have 6 possible methods, each of which corresponds to a combination. Our results show that all the combinations achieve classification accuracy of around 90\% with the best and the second best being MPEG-7 features with EP-HMM and MFCC with ML-HMM.},
}

@article{Wendt:2007,
	Author = {Wendt, H. and Abry, P.},
	Issn = {1053-587X},
	Journal = {{IEEE} Transactions on Signal Processing},
	Journal1 = {Signal Processing, IEEE Transactions on {$[$}see also Acoustics, Speech, and Signal Processing, IEEE Transactions on{$]$}},
	Keywords = {Bootstrap; hypothesis test; multifractal analysis; wavelet leaders},
	Number = {10},
	Pages = {4811--4820},
	Title = {Multifractality Tests Using Bootstrapped Wavelet Leaders},
	Title1 = {Signal Processing, IEEE Transactions on {$[$}see also Acoustics, Speech, and Signal Processing, IEEE Transactions on{$]$}},
	Ty = {JOUR},
	Volume = {55},
	Year = {2007},
	Abstract = {Multifractal analysis, which mostly consists of measuring scaling exponents, is becoming a standard technique available in most empirical data analysis toolboxes. Making use of the most recent theoretical results, it is based here on the estimation of the cumulants of the log of the wavelet Leaders, an elaboration on the wavelet coefficients. These log-cumulants theoretically enable discrimination between mono- and multifractal processes, as well as between simple log-normal multifractal models and more advanced ones. The goal of the present contribution is to design nonparametric bootstrap hypothesis tests aiming at testing the nature of the multifractal properties of stochastic processes and empirical data. Bootstrap issues together with six declinations of test designs are analyzed. Their statistical performance (significances, powers, and p-values) are assessed and compared by means of Monte Carlo simulations performed on synthetic stochastic processes whose multifractal properties (and log-cumulants) are known theoretically a priori. We demonstrate that the joint use of wavelet Leaders, log-cumulants, and bootstrap procedures enable us to obtain a powerful tool for testing the multifractal properties of data. This tool is practically effective and can be applied to a single observation of data with finite length.},
}

@article{Weruaga:2007,
	Author = {Weruaga, L.},
	Issn = {1053-587X},
	Journal = {{IEEE} Transactions on Signal Processing},
	Journal1 = {Signal Processing, IEEE Transactions on {$[$}see also Acoustics, Speech, and Signal Processing, IEEE Transactions on{$]$}},
	Keywords = {Autoregressive (AR) model; frequency domain; maximum-likelihood estimation; pole update},
	Number = {10},
	Pages = {4821--4830},
	Title = {All-Pole Estimation in Spectral Domain},
	Title1 = {Signal Processing, IEEE Transactions on {$[$}see also Acoustics, Speech, and Signal Processing, IEEE Transactions on{$]$}},
	Ty = {JOUR},
	Volume = {55},
	Year = {2007},
	Abstract = {Autoregressive (AR) modeling is a popular spectral analysis method commonly resolved in the time domain. This paper presents a novel AR analysis framework dealing with the estimation of poles directly from spectral samples. The basis of the method lies on a minimizing functional built with a certain mapping of the spectral residue. The optimization mechanism is based on the multivariate Newton-Raphson algorithm. Two different mappings are considered, namely, linear and logarithmic. The linear case results in a nonquadratic convex functional, whose global minimum is equivalent to that of the time-domain autocorrelation method. The logarithmic case under the Maximum Likelihood criterion turns out equivalent to the Whittle likelihood, proven here to be suitable for frequency selective estimation. The statistical and convergence performance of the method is demonstrated with simulations on stochastic and deterministic harmonic signals.},
}

@article{Cornu:2007,
	Author = {Cornu, C. and Stankovi{\'c}, S. and Ioana, C. and Quinquis, A. and Stankovi{\'c}, L.},
	Issn = {1053-587X},
	Journal = {{IEEE} Transactions on Signal Processing},
	Journal1 = {Signal Processing, IEEE Transactions on {$[$}see also Acoustics, Speech, and Signal Processing, IEEE Transactions on{$]$}},
	Keywords = {Higher order signal analysis; instantaneous frequency; signal representations; time--frequency analysis},
	Number = {10},
	Pages = {4831--4838},
	Title = {Generalized Representation of Phase Derivatives for Regular Signals},
	Title1 = {Signal Processing, IEEE Transactions on {$[$}see also Acoustics, Speech, and Signal Processing, IEEE Transactions on{$]$}},
	Ty = {JOUR},
	Volume = {55},
	Year = {2007},
	Abstract = {This paper introduces a new generalized complex-lag moment which produces joint time-phase derivatives distributions. For the choice of the time-first-order phase derivative, which stands for time--frequency representation, this distribution can be seen as a form of the Wigner-Ville distribution. Moreover, this generalization leads to distributions with highly reduced inner interferences caused by the nonlinearity of the signal's phase. It can also be seen as a polynomial distribution since the Nth-order distribution produces no inner interferences for polynomial phase law of order N. Implementation of these distributions is addressed. The results are illustrated by examples.},
}

@article{Mahata:2007,
	Author = {Mahata, K. and Fu, M.},
	Issn = {1053-587X},
	Journal = {{IEEE} Transactions on Signal Processing},
	Keywords = {Autoregressive moving average (ARMA) modeling; Nevanlinna--Pick interpolation; input-to-state filtering; spectral analysis},
	Number = {10},
	Pages = {4851--4861},
	Title = {A Robust Interpolation Algorithm for Spectral Analysis},
	Ty = {JOUR},
	Volume = {55},
	Year = {2007},
	Abstract = {We propose a \emph{robust} interpolation algorithm for model-based spectral analysis. Instead of estimating the spectral model directly, the so-called \emph{half spectrum}, which has a one-to-one relationship with the spectrum through standard spectral decomposition, is estimated using an interpolation approach. The interpolation data consists of the values and the derivatives of the half spectrum function at a set of user-specified points, and can be easily estimated using an input-to-state filter. Our algorithm allows a large number of noisy interpolation data to be used to optimally fit a half spectrum function of a fixed order. The capability of handling large number of interpolation data makes our algorithm \emph{robust} to the inherent finite sample noise in the interpolation data. The algorithm involves solving some least-squares problems and semidefinite programming problems, and is thus numerically efficient. Numerical tests show that our algorithm gives very reliable spectral estimates.},
}

@article{Rao:2007,
	Author = {Rao, K. and Prasanna, S. R. and Yegnanarayana, B.},
	Doi = {10.1109/LSP.2007.896454},
	Issn = {1070-9908},
	Journal = {{IEEE} Signal Processing Letters},
	Keywords = {Group delay function; Hilbert envelope; instants of significant excitation; linear prediction residual},
	Number = {10},
	Pages = {762--765},
	Title = {Determination of Instants of Significant Excitation in Speech Using {H}ilbert Envelope and Group Delay Function},
	Volume = {14},
	Year = {2007},
	Abstract = {This letter proposes a time-effective method for determining the instants of significant excitation in speech signals. The instants of significant excitation correspond to the instants of glottal closure (epochs) in the case of voiced speech, and to some random excitations like onset of burst in the case of nonvoiced speech. The proposed method consists of two phases: the first phase determines the approximate epoch locations using the Hilbert envelope of the linear prediction residual of the speech signal. The second phase determines the accurate locations of the instants of significant excitation by computing the group delay around the approximate epoch locations derived from the first phase. The accuracy in determining the instants of significant excitation and the time complexity of the proposed method is compared with the group delay based approach.},
}

@electronic{Lach:2007b,
	Author = {Lach, J. S.},
	Title = {A review of {James Tenney's} {A History of `Consonance' and `Dissonance'}},
	Url = {http://homepage.mac.com/jslach/Review%20of%20AHOC&D.pdf},
	Urldate = {21st Sept 2007},
}

@inproceedings{Lach:2007,
	Address = {The Hague, Netherlands},
	Author = {Lach, J. S.},
	Booktitle = {{S}uper{C}ollider Symposium 2007},
	Month = {September},
	Title = {Compositional Applications of Dissonance Curves},
	Year = {2007},
	Abstract = {In this talk I will describe my recent work on roughness analysis and some of its uses in making microtonal music. The research is at an early stage but it already poses interesting problems regarding the definitions and boundaries between timbre, harmony and texture in music composition.},
}

@electronic{Roweis:2001,
	Author = {Saul, L. and Roweis, S.},
	Title = {An Introduction to Locally Linear Embedding},
	Year = {2001},
	Url = {http://www.cs.toronto.edu/~roweis/lle/papers/lleintroa4.pdf},
	Urldate = {13th Sept 2007},
}

@article{Saul:2000,
	Author = {Roweis, S. and Saul, L.},
	Doi = {10.1126/science.290.5500.2323},
	Journal = {Science},
	Month = {December},
	Number = {5500},
	Pages = {2323--2326},
	Title = {Nonlinear dimensionality reduction by locally linear embedding},
	Url = {http://www.sciencemag.org/cgi/reprint/290/5500/2323.pdf},
	Volume = {290},
	Year = {2000},
}

@incollection{Abdi:2007,
	Author = {Abdi, H.},
	Booktitle = {Encyclopedia of Measurement and Statistics},
	Editor = {Salkind, N. J.},
	Publisher = {Sage},
	Title = {Multiple factor analysis},
	Url = {http://www.utdallas.edu/~herve/Abdi-MFA2007-pretty.pdf},
	Year = {2007},
}

@misc{Riley:2004,
	Author = {Riley, A. and Howard, D.},
	Title = {A real-time tristimulus timbre synthesizer},
	Howpublished = {Undergraduate project report, Department of Electronics, University of York},
	Month = {June},
	Year = {2004},
	Url = {http://www-users.york.ac.uk/~dmh8/papers/tristim-riley.PDF},
	Abstract = {This report details the design and verification of a MIDI-controlled software synthesizer based on the tristimulus method for timbral specification. The tristimulus method is discussed and adapted to suit a synthesis application and the software implementation is described thoroughly. A number of added features, derived from a consideration of musical techniques and acoustic properties, are included in the design. A user-friendly graphical interface forms the front-end of the software part of the system. The requirements of an appropriate hardware control surface are considered and a prototype consisting of existing controllers is implemented and tested. Audio samples provide a demonstration of the synthesizer. },
}

@inproceedings{DAlessandro:2007,
	Author = {D'Alessandro, N. and Moinet, A. and Dubuisson, T. and Dutoit, T.},
	Title = {Causal/anticausal decomposition for mixed-phase description of brass and bowed string sounds},
	Booktitle = {Proceedings of the International Computer Music Conference (ICMC'07)},
	Volume = {2},
	Pages = {465--468},
	Address = {Copenhagen, Denmark},
	Month = {August},
	Year = {2007},
	Keywords = {sawthistalk},
}

@inproceedings{Baalman:2007,
	Address = {Copenhagen, Denmark},
	Author = {Baalman, M.},
	Booktitle = {Proceedings of the International Computer Music Conference (ICMC'07)},
	Keywords = {sawthistalk},
	Month = {August},
	Pages = {9--11},
	Title = {On wave field synthesis and electro-acoustic music -- state of the art 2007},
	Volume = {1},
	Year = {2007},
}

@inproceedings{Rauhala:2007,
	Author = {Rauhala, J. and V{\"a}lim{\"a}ki, V.},
	Title = {F0 estimation of inharmonic piano tones using partial frequencies deviation method},
	Booktitle = {Proceedings of the International Computer Music Conference (ICMC'07)},
	Volume = {2},
	Pages = {453--456},
	Address = {Copenhagen, Denmark},
	Month = {August},
	Year = {2007},
	Keywords = {sawthistalk},
}

@inproceedings{Bradford:2007,
	Author = {Bradford, R. and Dobson, R. and Ffitch, J.},
	Title = {The sliding phase vocoder},
	Booktitle = {Proceedings of the International Computer Music Conference (ICMC'07)},
	Volume = {2},
	Pages = {449--452},
	Address = {Copenhagen, Denmark},
	Month = {August},
	Year = {2007},
	Keywords = {sawthistalk},
}

@inproceedings{Gerhard:2007,
	Author = {Gerhard, D. and Ellis, J.},
	Title = {Focus-plus-context displays for audio interaction},
	Booktitle = {Proceedings of the International Computer Music Conference (ICMC'07)},
	Volume = {2},
	Pages = {405--412},
	Address = {Copenhagen, Denmark},
	Month = {August},
	Year = {2007},
	Keywords = {sawthistalk},
}

@inproceedings{Birnbaum:2007,
	Author = {Birnbaum, D. and Wanderley, M. M.},
	Title = {A systematic approach to musical vibrotactile feedback},
	Booktitle = {Proceedings of the International Computer Music Conference (ICMC'07)},
	Volume = {2},
	Pages = {397--404},
	Address = {Copenhagen, Denmark},
	Month = {August},
	Year = {2007},
	Keywords = {sawthistalk},
}

@inproceedings{Kapur:2007,
	Author = {Kapur, A.},
	Title = {A comparison of solenoid-based strategies for robotic drumming},
	Booktitle = {Proceedings of the International Computer Music Conference (ICMC'07)},
	Volume = {2},
	Pages = {393--396},
	Address = {Copenhagen, Denmark},
	Month = {August},
	Year = {2007},
	Keywords = {sawthistalk},
}

@inproceedings{Collins:2007,
	Author = {Collins, N.},
	Title = {Audiovisual Concatenative Synthesis},
	Booktitle = {Proceedings of the International Computer Music Conference (ICMC'07)},
	Volume = {2},
	Pages = {389--392},
	Address = {Copenhagen, Denmark},
	Month = {August},
	Year = {2007},
	Keywords = {sawthistalk},
}

@inproceedings{Belanger:2007,
	Author = {B{\'e}langer, O. and Traube, C. and Pich{\'e}, J.},
	Title = {Designing and controlling a source-filter model for a naturalistic and expressive singing voice synthesis},
	Booktitle = {Proceedings of the International Computer Music Conference (ICMC'07)},
	Volume = {2},
	Pages = {328--331},
	Address = {Copenhagen, Denmark},
	Month = {August},
	Year = {2007},
	Keywords = {sawthistalk},
}

@inproceedings{Coleman:2007,
	Author = {Coleman, G.},
	Title = {Mused: Navigating the personal sample library},
	Booktitle = {Proceedings of the International Computer Music Conference (ICMC'07)},
	Volume = {2},
	Pages = {324--327},
	Address = {Copenhagen, Denmark},
	Month = {August},
	Year = {2007},
	Keywords = {sawthistalk},
}

@inproceedings{Frojd:2007,
	Author = {Fr{\"o}jd, M. and Horner, A.},
	Title = {Fast sound texture synthesis using overlap-add},
	Booktitle = {Proceedings of the International Computer Music Conference (ICMC'07)},
	Volume = {2},
	Pages = {320--323},
	Address = {Copenhagen, Denmark},
	Month = {August},
	Year = {2007},
	Keywords = {sawthistalk},
}

@inproceedings{Cont:2007,
	Author = {Cont, A. and Dubnov, S. and Assayag, G.},
	Title = {Guidage: A fast audio query guided assemblage},
	Booktitle = {Proceedings of the International Computer Music Conference (ICMC'07)},
	Volume = {2},
	Pages = {252--259},
	Address = {Copenhagen, Denmark},
	Month = {August},
	Year = {2007},
	Keywords = {sawthistalk},
}

@inproceedings{Orio:2007,
	Address = {Copenhagen, Denmark},
	Author = {Orio, N. and Zen, C.},
	Booktitle = {Proceedings of the International Computer Music Conference (ICMC'07)},
	Keywords = {sawthistalk},
	Month = {August},
	Pages = {248--251},
	Title = {Song identification through {HMM}-based modeling of the main melody},
	Volume = {2},
	Year = {2007},
}

@inproceedings{Somerville:2007,
	Author = {Somerville, P.},
	Title = {Note-based segmentation and hierarchy in the classification of digital musical instruments},
	Booktitle = {Proceedings of the International Computer Music Conference (ICMC'07)},
	Volume = {2},
	Pages = {240--247},
	Address = {Copenhagen, Denmark},
	Month = {August},
	Year = {2007},
	Keywords = {sawthistalk},
}

@inproceedings{Hazan:2007,
	Address = {Copenhagen, Denmark},
	Author = {Hazan, A. and Brossier, P. M. and Holonowicz, P. and Herrera, P. and Purwins, H.},
	Booktitle = {Proceedings of the International Computer Music Conference (ICMC'07)},
	Keywords = {sawthistalk},
	Month = {August},
	Pages = {228--235},
	Title = {Expectation along the beat: A use case for music expectation models},
	Volume = {2},
	Year = {2007},
}

@inproceedings{Dubnov:2007,
	Address = {Copenhagen, Denmark},
	Author = {Dubnov, S. and Assayag, G. and Cont, A.},
	Booktitle = {Proceedings of the International Computer Music Conference (ICMC'07)},
	Keywords = {sawthistalk},
	Month = {August},
	Pages = {224--227},
	Title = {{Audio Oracle}: A new algorithm for fast learning of audio structures},
	Volume = {2},
	Year = {2007},
}

@inproceedings{Hoffman:2007,
	Address = {Copenhagen, Denmark},
	Author = {Hoffman, M. and Cook, P. R.},
	Booktitle = {Proceedings of the International Computer Music Conference (ICMC'07)},
	Keywords = {sawthistalk},
	Month = {August},
	Pages = {184--187},
	Title = {The {FeatSynth} framework for feature-based synthesis: Design and applications},
	Volume = {2},
	Year = {2007},
}

@inproceedings{Lee:2007,
	Author = {Lee, N.},
	Title = {Excitation signal extraction for guitar tones},
	Booktitle = {Proceedings of the International Computer Music Conference (ICMC'07)},
	Volume = {1},
	Pages = {450},
	Address = {Copenhagen, Denmark},
	Month = {August},
	Year = {2007},
	Keywords = {sawthistalk},
}

@inproceedings{Dannenberg:2007a,
	Author = {Dannenberg, R. B.},
	Title = {An Intelligent Multitrack Audio Editor},
	Booktitle = {Proceedings of the International Computer Music Conference (ICMC'07)},
	Volume = {1},
	Address = {Copenhagen, Denmark},
	Month = {August},
	Year = {2007},
	Keywords = {sawthistalk},
}

@inproceedings{Zmolnig:2007,
	Author = {Zm{\"o}lnig, I.},
	Title = {Live coding: an overview},
	Booktitle = {Proceedings of the International Computer Music Conference (ICMC'07)},
	Volume = {1},
	Pages = {295--298},
	Address = {Copenhagen, Denmark},
	Month = {August},
	Year = {2007},
	Keywords = {sawthistalk},
}

@inproceedings{Fontana:2007,
	Address = {Copenhagen, Denmark},
	Author = {Fontana, F.},
	Booktitle = {Proceedings of the International Computer Music Conference (ICMC'07)},
	Keywords = {sawthistalk},
	Month = {August},
	Pages = {291--294},
	Title = {Preserving the structure of the {M}oog {VCF} in the digital domain},
	Volume = {1},
	Year = {2007},
}

@inproceedings{Puckette:2007,
	Address = {Copenhagen, Denmark},
	Author = {Puckette, M.},
	Booktitle = {Proceedings of the International Computer Music Conference (ICMC'07)},
	Keywords = {sawthistalk},
	Month = {August},
	Pages = {287--290},
	Title = {On timbre stamps and other frequency-domain filters},
	Volume = {1},
	Year = {2007},
}

@inproceedings{Burgoyne:2007,
	Author = {Burgoyne, A. and McAdams, S.},
	Title = {Non-linear scaling techniques for uncovering the perceptual dimensions of timbre},
	Booktitle = {Proceedings of the International Computer Music Conference (ICMC'07)},
	Volume = {1},
	Pages = {73--76},
	Address = {Copenhagen, Denmark},
	Month = {August},
	Year = {2007},
	Keywords = {sawthistalk},
}

@inproceedings{Dannenberg:2007,
	Author = {Dannenberg, R. B.},
	Title = {Abstract behaviours for structured music programming},
	Booktitle = {Proceedings of the International Computer Music Conference (ICMC'07)},
	Volume = {1},
	Pages = {31--34},
	Address = {Copenhagen, Denmark},
	Month = {August},
	Year = {2007},
	Keywords = {sawthistalk},
}

@inproceedings{Eigenfeldt:2007,
	Author = {Eigenfeldt, A.},
	Title = {Drum circle: Intelligent agents in {M}ax/{MSP}},
	Booktitle = {Proceedings of the International Computer Music Conference (ICMC'07)},
	Volume = {1},
	Pages = {9--11},
	Address = {Copenhagen, Denmark},
	Month = {August},
	Year = {2007},
	Keywords = {sawthistalk},
}

@article{Frigo:2005,
	Author = {Frigo, M. and Johnson, S. G.},
	Doi = {10.1109/JPROC.2004.840301},
	Journal = {Proceedings of the IEEE},
	Number = {2},
	Pages = {216--231},
	Title = {The design and implementation of {FFTW3}},
	Url = {http://www.fftw.org/fftw-paper-ieee.pdf},
	Volume = {93},
	Year = {2005},
	Abstract = {FFTW is an implementation of the discrete Fourier transform (DFT) that adapts to the hardware in order to maximize performance. This paper shows that such an approach can yield an implementation that is competitive with hand-optimized libraries, and describes the software structure that makes our current FFTW3 version flexible and adaptive. We further discuss a new algorithm for real-data DFTs of prime size, a new way of implementing DFTs by means of machine-specific ``SIMD'' instructions, and how a special-purpose compiler can derive optimized implementations of the discrete cosine and sine transforms automatically from a DFT algorithm.},
}

@book{Preece:2004,
	Author = {Preece, J. and Rogers, Y. and Sharp, H.},
	Title = {Interaction design},
	Publisher = {Apogeo Editore},
	Year = {2004},
	Keywords = {evaluation, qualitative, discourse analysis},
}

@article{Camps:2003,
	Author = {Camps, J.},
	Title = {Concurrent and retrospective verbal reports as tools to better understand the role of attention in second language tasks},
	Journal = {International Journal of Applied Linguistics},
	Volume = {13},
	Number = {2},
	Pages = {201--221},
	Year = {2003},
	Publisher = {Blackwell Synergy},
	Doi = {10.1111/1473-4192.00044},
	Abstract = {This study investigates how the use of think-aloud protocols, both concurrent and retrospective, can contribute to the study of the role of attention in second language acquisition. It is based on the analysis of think-aloud protocols produced by 74 first-year learners of Spanish during and immediately after a reading and multiple-choice activity. The activity consisted of a text with direct object pronouns and 16 blanks for which a choice of three possible antecedents was given. The key to making the right choice was attending to both form and meaning in the input. The data in the think-aloud protocols was classified into mentions of the pronouns in the text as well as references to gender and number agreement. The results showed that mention of the targeted structure in the think-aloud protocols was related to better performance on the task for second-semester students, but not for first-semester students. There was some difference in the results for the concurrent and retrospective protocols. The possible complementary nature of these two sources of data is discussed.},
}

@inproceedings{Tang:2002,
	Author = {Tang, H. H. and Gero, J. S.},
	Booktitle = {Creative Systems: Approaches to Creativity in {AI} and Cognitive Science, Lyon},
	Keywords = {creativity, evaluation},
	Pages = {47--54},
	Title = {A cognitive method to measure potential creativity in designing},
	Url = {http://people.arch.usyd.edu.au/~john/publications/2002/02TangGeroECAI.pdf},
	Year = {2002},
	Abstract = {Using concepts from creative cognition this paper proposes a cognitive method to measure potential creativity in designing. The empirical data from protocol studies presents examples of the measurement of the novelty, value, and unpredictability in the different cognitive levels. Finally, we propose a model of creativity and designing as situated.},
}

@article{Tillmann:2004,
	Author = {Tillmann, B. and McAdams, S.},
	Doi = {10.1037/0278-7393.30.5.1131},
	Journal = {Journal of Experimental Psychology: Learning, Memory, and Cognition},
	Number = {5},
	Pages = {1131--1142},
	Title = {Implicit Learning of Musical Timbre Sequences: Statistical Regularities Confronted With Acoustical {(Dis)Similarities}},
	Url = {http://olfac.univ-lyon1.fr/unite/equipe-02/tillmann/download/2004_TiMcA_IL.pdf},
	Volume = {30},
	Year = {2004},
	Abstract = {The present study investigated the influence of acoustical characteristics on the implicit learning of statistical regularities (transition probabilities) in sequences of musical timbres. The sequences were constructed in such a way that the acoustical dissimilarities between timbres potentially created segmentations that either supported (S1) or contradicted (S2) the statistical regularities or were neutral (S3). In the learning group, participants first listened to the continuous timbre sequence and then had to distinguish statistical units from new units. In comparison to a control group without the exposition phase, no interaction between sequence type and amount of learning was observed: Performance increased by the same amount for the three sequences. In addition, performance reflected an overall preference for acoustically similar timbre units. The present outcome extends previous data from the domain of implicit learning to complex nonverbal auditory material. It further suggests that listeners become sensitive to statistical regularities despite acoustical characteristics in the material that potentially affect grouping.},
}

@article{1152204,
	Address = {New York, NY, USA},
	Author = {Dubnov, S. and McAdams, S. and Reynolds, R.},
	Doi = {10.1002/asi.v57:11},
	Issn = {1532-2882},
	Journal = {Journal of the American Society for Information Science and Technology},
	Number = {11},
	Pages = {1526--1536},
	Publisher = {John Wiley \& Sons, Inc.},
	Title = {Structural and affective aspects of music from statistical audio signal analysis: Special Topic Section on Computational Analysis of Style},
	Volume = {57},
	Year = {2006},
	Abstract = {Understanding and modeling human experience and emotional response when listening to music are important for better understanding of the stylistic choices in musical composition. In this work, we explore the relation of audio signal structure to human perceptual and emotional reactions. Memory, repetition, and anticipatory structure have been suggested as some of the major factors in music that might influence and possibly shape these responses. The audio analysis was conducted on two recordings of an extended contemporary musical composition by one of the authors. Signal properties were analyzed using statistical analyses of signal similarities over time and information theoretic measures of signal redundancy. They were then compared to Familiarity Rating and Emotional Force profiles, as recorded continually by listeners hearing the two versions of the piece in a live-concert setting. The analysis shows strong evidence that signal properties and human reactions are related, suggesting applications of these techniques to music understanding and music information-retrieval systems.},
}

@inproceedings{Pauletto:2005,
	Author = {Pauletto, S. and Hunt, A.},
	Title = {A comparison of audio and visual analysis of complex time-series data sets},
	Booktitle = {Proceedings of the International Conference on Auditory Display (ICAD05)},
	Month = {July},
	Year = {2005},
	Url = {http://www.idc.ul.ie/icad2005/downloads/f30.pdf},
}

@article{Smith:1999,
	Author = {Smith, III, J. O. and Abel, J. S.},
	Issn = {1063-6676},
	Journal = {{IEEE} Transactions on Speech and Audio Processing},
	Journal1 = {Speech and Audio Processing, IEEE Transactions on},
	Keywords = {all-pass filters; audio signal processing; channel bank filters; error analysis; filtering theory; signal resolution; signal sampling; spectral analysis; transfer functions; transforms; Bark bilinear transform; Bark frequency scale; ERB bilinear transform; audio filter design; audio signal processing; auditory filterbanks; bilinear conformal map; closed-form weighted-equation-error method; equivalent rectangular bandwidth scale; filter design; first-order allpass filter; first-order allpass transformation; first-order map; frequency resolution; frequency warping; model order; optimal Chebyshev mapping; optimal allpass coefficient; optimal least-squares solution; optimal mapping coefficient; psychoacoustic based spectral measures; rational systems; sampling rate; transfer function; unit circle},
	Number = {6},
	Pages = {697--708},
	Title = {{Bark} and {ERB} bilinear transforms},
	Volume = {7},
	Year = {1999},
	Abstract = {Use of a bilinear conformal map to achieve a frequency warping nearly identical to that of the Bark frequency scale is described. Because the map takes the unit circle to itself, its form is that of the transfer function of a first-order allpass filter. Since it is a first-order map, it preserves the model order of rational systems, making it a valuable frequency warping technique for use in audio filter design. A closed-form weighted-equation-error method is derived that computes the optimal mapping coefficient as a function of sampling rate, and the solution is shown to be generally indistinguishable from the optimal least-squares solution. The optimal Chebyshev mapping is also found to be essentially identical to the optimal least-squares solution. The expression $0.8517\,[\arctan(0.06583 f_s)]^{1/2} - 0.916$ is shown to accurately approximate the optimal allpass coefficient as a function of sampling rate $f_s$ in kHz for sampling rates greater than 1 kHz. A filter design example is included that illustrates improvements due to carrying out the design over a Bark scale. Corresponding results are also given and compared for approximating the related ``equivalent rectangular bandwidth (ERB) scale'' of Moore and Glasberg (ACTA Acustica, vo.82, p.335-45, 1996) using a first-order allpass transformation. Due to the higher frequency resolution called for by the ERB scale, particularly at low frequencies, the first-order conformal map is less able to follow the desired mapping, and the error is two to three times greater than the Bark-scale case, depending on the sampling rate.},
}

@book{Silverman:2006,
	Author = {Silverman, D.},
	Title = {Interpreting Qualitative Data: Methods for Analysing Talk, Text and Interaction},
	Edition = {2nd},
	Publisher = {Sage Publications Inc},
	Year = {2006},
	Keywords = {evaluation, qualitative, interview, discourse analysis},
}

@inproceedings{Paine:2007,
	Author = {Paine, G. and Stevenson, I. and Pearce, A.},
	Booktitle = {Proceedings of {N}ew {I}nterfaces for {M}usical {E}xpression ({NIME})},
	Keywords = {evaluation, interfaces},
	Pages = {70--77},
	Title = {The {Thummer} Mapping Project ({ThuMP})},
	Url = {http://itp.nyu.edu/nime/2007/proc/nime2007_070.pdf},
	Year = {2007},
	Abstract = {This paper presents the Thummer Mapping Project (ThuMP), an industry partnership project between ThumMotion P/L and The University of Western Sydney (UWS). ThuMP sought to develop mapping strategies for new interfaces for musical expression (NIME), specifically the Thummer\texttrademark{}, which provides thirteen simultaneous degrees of freedom. This research presents a new approach to the mapping problem resulting from a primary design research phase and a prototype testing and evaluation phase. In order to establish an underlying design approach for the Thummer\texttrademark{} mapping strategies, a number of interviews were carried out with high-level acoustic instrumental performers, the majority of whom play with the Sydney Symphony Orchestra, Sydney, Australia. Mapping strategies were developed from analysis of these interviews and then evaluated in trial usability testing.},
}

@inproceedings{Gotzen:2005,
	Address = {Salerno, Italy},
	Author = {de G{\"o}tzen, A. and Serafin, S.},
	Booktitle = {Proceedings of the SMC 2005 Conference},
	Month = nov,
	Title = {The {Croaker}: design and evaluation of a new multimodal interface},
	Url = {http://smc.afim-asso.org/smc05/papers/StefaniaSerafin/SMC05paperfin.pdf},
	Year = {2005},
	Abstract = {In this paper we introduce the Croaker, a novel input device inspired by Russolo's Intonarumori. We describe the motivations behind the design of this instrument, and its applications in human computer interaction (HCI) and music.},
}

@inproceedings{Young:2003,
	Author = {Young, D. and Serafin, S.},
	Title = {Playability evaluation of a virtual bowed string instrument},
	Booktitle = {Proceedings of {N}ew {I}nterfaces for {M}usical {E}xpression ({NIME})},
	Pages = {104--108},
	Publisher = {National University of Singapore},
	Address = {Singapore},
	Year = {2003},
	Url = {http://www.media.mit.edu/hyperins/papers/YoungSerafin_NIME03.pdf},
}

@article{Bach:2004,
	Author = {Bach, F. R. and Jordan, M. I.},
	Issn = {1053-587X},
	Journal = {IEEE Transactions on Signal Processing},
	Keywords = {Gaussian processes; data structures; frequency-domain analysis; graph theory; probability; signal representation; sparse matrices; time series; Toeplitz matrices; computational complexity; conjugate gradient methods; data structure; forecasting; frequency-domain analysis; graphical model learning; kernels; local smoothing; parameter estimation; probabilistic graphical model semantics; signal representation; sparse matrices; spectral analysis; stationary Gaussian time series; Frequency domain analysis; modeling; sparse matrices; spectral analysis; statistics; time series},
	Number = {8},
	Pages = {2189--2199},
	Title = {Learning graphical models for stationary time series},
	Volume = {52},
	Year = {2004},
	Abstract = {Probabilistic graphical models can be extended to time series by considering probabilistic dependencies between entire time series. For stationary Gaussian time series, the graphical model semantics can be expressed naturally in the frequency domain, leading to interesting families of structured time series models that are complementary to families defined in the time domain. In this paper, we present an algorithm to learn the structure from data for directed graphical models for stationary Gaussian time series. We describe an algorithm for efficient forecasting for stationary Gaussian time series whose spectral densities factorize in a graphical model. We also explore the relationships between graphical model structure and sparsity, comparing and contrasting the notions of sparsity in the time domain and the frequency domain. Finally, we show how to make use of Mercer kernels in this setting, allowing our ideas to be extended to nonlinear models.},
}

@book{Lutkepohl:1993,
	Author = {L{\"u}tkepohl, H.},
	Title = {Introduction to Multiple Time Series Analysis},
	Publisher = {Springer},
	Year = {1993},
}

@article{Meng:2007,
	Author = {Meng, A. and Ahrendt, P. and Larsen, J. and Hansen, L. K.},
	Journal = {{IEEE} Transactions on Audio, Speech, and Language Processing},
	Keywords = {Temporal Feature integration, autoregressive model, music genre classification},
	Month = nov,
	Title = {Temporal Feature Integration for Music Genre Classification},
	Url = {http://www2.imm.dtu.dk/pubdb/p.php?4023},
	Year = {2007},
	Abstract = {Temporal feature integration is  the process of combining all the feature
vectors in a time frame into a single feature vector in order to
capture the relevant temporal information in the frame. The mean and variance
along the temporal dimension are often used for temporal feature
integration, but they capture neither the temporal dynamics nor
dependencies among the individual feature dimensions. Here, a
multivariate autoregressive feature model is proposed to solve
this problem for music genre classification. This model gives two
different feature sets, the {DAR} and {MAR} features, which are
compared against the baseline mean-variance as well as two other
temporal feature integration techniques. Reproducibility in
performance ranking of temporal feature integration methods
were demonstrated using two data sets with five and eleven music genres, and by using four
different classification schemes. The methods were further compared to
human performance. The proposed {MAR} features perform significantly
better than the other features without much increase in
computational complexity.},
}

@inproceedings{Moon:2007,
	Author = {Moon, H. and Arora, M. and Chung, C. and Jang, S.},
	Booktitle = {Proceedings of 122nd Audio Engineering Society Convention},
	Month = may,
	Number = {7024},
	Title = {Enhanced Bass Reinforcement Algorithm for Small-Sized Transducer},
	Year = {2007},
	Abstract = {Nowadays, mobile devices such as cell phones or mp3 players using small-sized loudspeaker systems to supply sound events to users is very popular. The main reasons why small-sized transducers are being used are due to the design and the size of the devices. Unfortunately, their design and size restrain the transducers from high quality of low frequency performance. To breakthrough this physical barrier of poor low frequency generation, the well-known psychoacoustical background ``missing fundamental illusion'' is exploited. In this paper the method of enhancing bass perception using virtual pitch is presented. In our demonstration, listeners can feel the deep bass with fewer artifacts.},
}

@inproceedings{Buono:2005,
	Author = {Buono, P. and Aris, A. and Plaisant, C. and Khella, A. and Shneiderman, B.},
	Title = {Interactive pattern search in time series},
	Booktitle = {Proc. SPIE},
	Volume = {5669},
	Pages = {175--186},
	Year = {2005},
	Abstract = {The need for pattern discovery in long time series data led researchers to develop algorithms for similarity search. Most of the literature about time series focuses on algorithms that index time series and bring the data into the main storage, thus providing fast information retrieval on large time series. This paper reviews the state of the art in visualizing time series, and focuses on techniques that enable users to interactively query time series. Then it presents TimeSearcher 2, a tool that enables users to explore multidimensional data using coordinated tables and graphs with overview+detail, filter the time series data to reduce the scope of the search, select an existing pattern to find similar occurrences, and interactively adjust similarity parameters to narrow the result set. This tool is an extension of previous work, TimeSearcher 1, which uses graphical timeboxes to interactively query time series data. 
},
}

@article{Hochheiser:2004,
	Author = {Hochheiser, H. and Shneiderman, B.},
	Journal = {Information Visualization},
	Number = {1},
	Pages = {1--18},
	Title = {Dynamic query tools for time series data sets: Timebox widgets for interactive exploration},
	Volume = {3},
	Year = {2004},
	Abstract = {Timeboxes are rectangular widgets that can be used in direct-manipulation
graphical user interfaces (GUIs) to specify query constraints on time series data
sets. Timeboxes are used to specify simultaneously two sets of constraints:
given a set of $N$ time series profiles, a timebox covering time periods $x_1 \ldots x_2$
($x_1 \leq x_2$) and values $y_1 \ldots y_2$ ($y_1 \leq y_2$) will retrieve only those $n \in N$ that have values
$y_1 \leq y \leq y_2$ during all times $x_1 \leq x \leq x_2$. TimeSearcher is an information
visualization tool that combines timebox queries with overview displays,
query-by-example facilities, and support for queries over multiple time-varying attributes.
Query manipulation tools including pattern inversion and `leaders {\&} laggards'
graphical bookmarks provide additional support for interactive exploration of
data sets. Extensions to the basic timebox model that provide additional
expressivity include variable time timeboxes, which can be used to express
queries with variability in the time interval, and angular queries, which search
for ranges of differentials, rather than absolute values. Analysis of the
algorithmic requirements for providing dynamic query performance for
timebox queries showed that a sequential search outperformed searches based
on geometric indices. Design studies helped identify the strengths and
weaknesses of the query tools. Extended case studies involving the analysis of
two different types of data from molecular biology experiments provided
valuable feedback and validated the utility of both the timebox model and the
TimeSearcher tool. Timesearcher is available at http://www.cs.umd.edu/hcil/timesearcher},
}

@inproceedings{Muller:2003,
	Author = {M{\"{u}}ller, Wolfgang and Schumann, Heidrun},
	Booktitle = {WSC '03: Proceedings of the 35th conference on Winter simulation},
	Isbn = {0-7803-8132-7},
	Location = {New Orleans, Louisiana},
	Pages = {737--745},
	Publisher = {Winter Simulation Conference},
	Title = {Visualization methods for time-dependent data -- an overview},
	Year = {2003},
	Abstract = {Visualization has been successfully applied to analyse time-dependent data for a long time now. Lately, a number of new approaches have been introduced, promising more effective graphs especially for large datasets and multiparameter data. In this paper, we give an overview on the visualization of time-series data and the available techniques. We provide a taxonomy and discuss general aspects of time-dependent data. After an overview on conventional techniques we discuss techniques for analysing time-dependent multivariate data sets in more detail. After this, we give an overview on dynamic presentation techniques and event-based visualization.},
}

@inproceedings{Carlis:1998,
	Address = {New York, NY, USA},
	Author = {Carlis, J. V. and Konstan, J. A.},
	Booktitle = {Proceedings of the 11th annual ACM symposium on User interface software and technology},
	Doi = {10.1145/288392.288399},
	Pages = {29--38},
	Publisher = {ACM Press},
	Title = {Interactive visualization of serial periodic data},
	Year = {1998},
	Abstract = {Serial periodic data exhibit both serial and periodic
properties.  For example, time continues forward serially,
but weeks, months, and years are periods that recur.
While there are extensive visualization techniques for
exploring serial data, and a few for exploring periodic
data, no existing technique simultaneously displays serial
and periodic attributes of a data set.  We introduce a spiral
visualization technique, which displays data along a spiral
to highlight serial attributes along the spiral axis and
periodic ones along the radii.  We show several
applications of the spiral visualization to data exploration
tasks, present our implementation, discuss the capacity for
data analysis, and present findings of our informal study
with users in data-rich scientific domains.},
}

@techreport{Gerhard:2003,
	Author = {Gerhard, D.},
	Title = {Pitch Extraction and Fundamental Frequency: History and Current Techniques},
	Institution = {Dept. of Computer Science, University of Regina},
	Number = {TR-CS 2003-06},
	Year = {2003},
	Url = {http://www.cs.uregina.ca/Research/Techreports/2003-06.pdf},
	Keywords = {pitch determination, pitch estimation, fundamental frequency},
	Abstract = {Pitch extraction (also called fundamental frequency estimation) has been a popular topic in many fields of research since the age of computers. Yet in the course of some 50 years of study, current techniques are still not to a desired level of accuracy and robustness. When presented with a single clean pitched signal, most techniques do well, but when the signal is noisy, or when there are multiple pitch streams, many current pitch algorithms still fail to perform well. This report presents a discussion of the history of pitch detection techniques, as well as a survey of the current state of the art in pitch detection technology. 
},
}

@book{Silverman:2005,
	Author = {Silverman, D.},
	Edition = {2nd},
	Isbn = {1412901979},
	Keywords = {qualitative, interview, evaluation, discourse analysis},
	Publisher = {Sage Publications Inc},
	Title = {Doing Qualitative Research: a Practical Handbook},
	Url = {http://books.google.com/books?id=gs2yhThw4vMC&dq=%22doing+qualitative+research%22+silverman},
	Year = {2005},
	Abstract = {Written in a lively, accessible style, this step-by-step guide provides answers to all the questions students ask when beginning their first research project. David Silverman demonstrates how to learn the craft of qualitative research by applying knowledge about different methodologies to actual data. He provides practical advice on key issues, such as: defining `originality' and narrowing down a topic; keeping a research diary and writing a research report; and presenting research to different audiences. Packed with case studies and examples of students' experiences, the book has many features to aid study, including overviews, summaries of key skills and a glossary of terms. Each stage in the research process is grounded in worked examples, with exercises designed both to test students' knowledge and to encourage the development of practical skills.},
}

@inproceedings{Dobrian:2006,
	Address = {Paris, France},
	Author = {Dobrian, C. and Koppelman, D.},
	Booktitle = {Proceedings of {N}ew {I}nterfaces for {M}usical {E}xpression ({NIME})},
	Pages = {277--282},
	Publisher = {IRCAM -- Centre Pompidou},
	Title = {The {`E'} in {NIME}: musical expression with new computer interfaces},
	Url = {http://www.nime.org/2006/proc/nime2006_277.pdf},
	Year = {2006},
	Abstract = {Is there a distinction between New Interfaces for Musical Expression and New Interfaces for Controlling Sound? This article begins with a brief overview of expression in musical performance, and examines some of the characteristics of effective ``expressive'' computer music instruments. It becomes apparent that sophisticated musical expression requires not only a good control interface but also virtuosic mastery of the instrument it controls. By studying effective acoustic instruments, choosing intuitive but complex gesture-sound mappings that take advantage of established instrumental skills, designing intelligent characterizations of performance gestures, and promoting long-term dedicated practice on a new interface, computer music instrument designers can enhance the expressive quality of computer music performance.},
}

@inproceedings{Arroabarren:2002,
	Author = {Arroabarren, I. and Zivanovic, M. and Carlosena, A.},
	Title = {Analysis and synthesis of vibrato in lyric singers},
	Booktitle = {Proceedings of the 11th European Signal Processing Conference (EUSIPCO'02)},
	Year = {2002},
	Url = {http://www.eurasip.org/content/Eusipco/2002/articles/paper205.pdf},
	Keywords = {vibrato},
	Abstract = {In this paper two aspects of the vibrato signal characterisation are 
presented. First, an analysis method, which breaks down the musical 
signal into its different partials, and decomposes them into two 
separate contributions: AM and FM. This allows an objective 
characterization of the vibrato according to musical parameters. 
Second, a procedure for vibrato synthesis is proposed, based on the 
previous analysis. Even though the results of the synthesis are not 
fully satisfactory, they shed new light on the vibrato modeling. 
},
}

@inproceedings{herrera1998vea,
	Address = {Barcelona, Spain},
	Author = {Herrera, P. and Bonada, J.},
	Booktitle = {Proceedings of the First Digital Audio Effects Workshop (DAFX98)},
	Keywords = {vibrato},
	Title = {Vibrato extraction and parameterization in the spectral modeling synthesis framework},
	Url = {http://www.iua.upf.es/mtg/publications/dafx98-perfe.pdf},
	Year = {1998},
	Abstract = {Periodic or quasi-periodic low-frequency components (i.e. vibrato and tremolo) are present in
steady-state portions of sustained instrumental sounds. If we are interested both in studying its expressive
meaning, or in building a hierarchical multi-level representation of sound in order to manipulate it
and transform it with musical purposes those components should be isolated and separated from the
amplitude and frequency envelopes. Within the SMS analysis framework it is now feasible to extract
high level time-evolving attributes starting from basic analysis data. In the case of frequency
envelopes we can apply STFTs to them, then check if there is a prominent peak in the
vibrato/tremolo range and, if it is true, we can smooth it away in the frequency domain; finally, we
can apply an IFFT to each frame in order to re-construct an envelope that has been cleaned of those
quasi-periodic low-frequency components. Two important problems nevertheless have to be tackled,
and ways of overcoming them will be discussed in this paper: first, the periodicity of vibrato and
tremolo, that  is quite exact only when the performers are professional musicians; second: the
interactions between formants and fundamental frequency trajectories, that blur the real tremolo
component and difficult its analysis.},
}

@mastersthesis{Wells:1962,
	Author = {Wells, J. G.},
	Title = {A Study of the Formants of the Pure Vowels of {B}ritish {E}nglish},
	School = {University of London},
	Year = {1962},
	Url = {http://www.phon.ucl.ac.uk/home/wells/formants/index-uni.htm},
	Abstract = {The results are presented of measurements made on certain vowel sounds. Twenty-five male speakers of British English (Received Pronunciation) acted as subjects. The vowels investigated were those of the words heed, hid, head, had, hard, hod, haw'd, hood, who'd, Hud, and heard.

Vowel sounds are characterized acoustically by formants, which are frequency regions of high energy concentration corresponding to the passbands of the throat and mouth cavities. The frequency and amplitude of each of the lowest three formants of each vowel were investigated by means of the acoustic spectrograph, and their averages (together, in the case of frequencies, with their dispersions) for the 25 subjects calculated. Measurements were also made of the duration of the vowels concerned.

The results obtained, which are summarized in Table 1, are compared with the results of a similar analysis of General American vowels.},
}

@misc{Poupyrev:2001,
	Author = {Poupyrev, I. and Lyons, M. J. and Fels, S. and Blaine, T.},
	Title = {{N}ew {I}nterfaces for {M}usical {E}xpression},
	Howpublished = {Workshop proposal},
	Year = {2001},
	Url = {http://www.nime.org/2001/docs/proposal.pdf},
	Keywords = {nime, expressive interfaces, evaluation},
	Abstract = {The rapid evolution of electronics, digital media, advanced materials, and other areas of technology, is opening up unprecedented opportunities for musical interface inventors and designers. The possibilities afforded by these new technologies carry with them the challenges of a complex and often confusing array of choices for musical composers and performers. New musical technologies are at least partly responsible for the current explosion of new musical forms, some of which are controversial and challenge traditional definitions of music. Alternative musical controllers, currently the leading edge of the ongoing dialogue between technology and musical culture, involve many of the issues covered at past CHI meetings. 
},
}

@inproceedings{Severin:2005,
	Author = {Severin, F. and Bozkurt, B. and Dutoit, T.},
	Title = {{HNR} extraction in voiced speech, oriented towards voice quality analysis},
	Booktitle = {Proceedings of Eusipco 2005 (13th European Signal Processing Conference)},
	Year = {2005},
	Url = {http://www.arehna.di.uoa.gr/Eusipco2005/defevent/papers/cr1394.pdf},
	Abstract = {This study tests three methods (algorithms of G. de Krom, C. d'Alessandro et al. and P. Boersma) to estimate the Harmonics-to-Noise Ratio (HNR) in speech. Tests are made on two databases of naturally connected speech designed for voice quality analysis. First, results of the three methods are compared, then the relevance of each method is analysed separately. The conclusion is that they are all good indicators of the amount of noise in speech, and though their accuracy is limited, they are efficient for voice quality analysis.},
}

@inproceedings{vincent2007nms,
	Author = {Vincent, D. and Rosec, O. and Chonavel, T.},
	Booktitle = {Proceedings of the {IEEE} International Conference on Acoustics, Speech and Signal Processing ({ICASSP})},
	Doi = {10.1109/ICASSP.2007.366965},
	Pages = {IV-525--IV-528},
	Title = {A New Method for Speech Synthesis and Transformation Based on an {ARX-LF} Source-Filter Decomposition and {HNM} Modeling},
	Volume = {4},
	Year = {2007},
	Abstract = {In this paper a new method for speech synthesis is proposed. It relies on a source-filter decomposition of the speech signal by means of an ARX-LF model. This model allows the representation of the glottal signal as the sum of an LF waveform and a residual signal. The residual information is then analyzed by HNM. This signal representation enables high quality speech modification such as pitch, duration or even voice quality transformation. Experiments performed on a real speech database show the relevance of the proposed method as compared to other existing approaches.},
}

@article{Grassberger:1983,
	Author = {Grassberger, P. and Procaccia, I.},
	Doi = {10.1016/0167-2789(83)90298-1},
	Journal = {Physica D: Nonlinear Phenomena},
	Keywords = {fractals, fractal dimension},
	Number = {1--2},
	Pages = {189--208},
	Title = {Measuring the strangeness of strange attractors},
	Url = {http://www.sciencedirect.com/science/article/B6TVK-46JYPW4-50/2/621ece0782ffb2c77e4bf4180d76a459},
	Volume = {9},
	Year = {1983},
	Abstract = {We study the correlation exponent $\nu$ introduced recently as a characteristic measure of strange attractors which allows one to distinguish between deterministic chaos and random noise. The exponent $\nu$ is closely related to the fractal dimension and the information dimension, but its computation is considerably easier. Its usefulness in characterizing experimental data which stem from very high dimensional systems is stressed. Algorithms for extracting $\nu$ from the time series of a single variable are proposed. The relations between the various measures of strange attractors and between them and the Lyapunov exponents are discussed. It is shown that the conjecture of Kaplan and Yorke for the dimension gives an upper bound for $\nu$. Various examples of finite and infinite dimensional systems are treated, both numerically and analytically.},
}

@article{Theiler:1990,
	Author = {Theiler, J.},
	Journal = {Journal of the Optical Society of America A},
	Number = {6},
	Pages = {1055--1073},
	Publisher = {OSA},
	Title = {Estimating fractal dimension},
	Url = {http://public.lanl.gov/jt/Papers/est-fractal-dim.pdf},
	Volume = {7},
	Year = {1990},
	Abstract = {The nature of chaos and strange attractors is reviewed, and definitions of fractal dimensions are examined. Algorithms for estimating fractal dimensions are discussed. The implementation of box-counting algorithms and of the correlation algorithm for estimating fractal dimensions is addressed.},
}

@article{Grey:1978b,
	Author = {Grey, J. M.},
	Doi = {10.1121/1.382018},
	Journal = {The Journal of the Acoustical Society of America},
	Number = {2},
	Pages = {467--472},
	Publisher = {ASA},
	Title = {Timbre discrimination in musical patterns},
	Volume = {64},
	Year = {1978},
	Abstract = {Most research on timbre perception has studied isolated tones. This study compares timbre discrimination of isolated tones with discrimination in various musical contexts, both single-voiced and multivoiced. Twelve different contexts were used (four isolated tonal comparisons, four single-voice musical patterns, and four multivoice patterns). Listeners judged whether the timbre remained the same or changed during the trial. Two possible versions of any instrumental timbre differed in the physical information used in their synthesis. Three instrumental timbres were tested in all contexts: clarinet, trumpet, and bassoon. The effects of context upon discrimination varied across instruments. The clarinet and trumpet versions were best discriminated in isolated contexts, with discrimination progressively worse in single-voice and multivoice patterns. The bassoon versions were best discriminated in the single-voice patterns, with equal discrimination in the isolated and multivoice cases. It is suggested that these results were due to pronounced physical differences observed between the spectra of the two versions of the bassoon that were not apparent between the versions of the clarinet or trumpet.},
}

@article{Grey:1978,
	Author = {Grey, J. M. and Gordon, J. W.},
	Title = {Perceptual effects of spectral modifications on musical timbres},
	Journal = {The Journal of the Acoustical Society of America},
	Volume = {63},
	Number = {5},
	Pages = {1493--1500},
	Year = {1978},
	Publisher = {ASA},
	Doi = {10.1121/1.381843},
	Abstract = {An experiment was performed to evaluate the effects of spectral modifications on the similarity structure for a set of musical timbres. The stimuli were 16 music instrument tones, 8 of which were modified in pairs. This modification consisted of exchanging the shape of the spectral energy distribution between the two tones within each pair. The three-dimensional spatial representation of similarities among the 16 tones was obtained by multidimensional scaling techniques and compared to a previous scaling of the original 16 unmodified tones [J. M. Grey, J. Acoust. Soc. Am. 61, 1270--1277 (1977)]. The pairs of tones which had exchanged spectral shapes in fact exchanged orders on the spatial axis which had been previously interpreted as relating to spectral shape, thereby supporting the earlier interpretation. The two remaining axes of the spatial solution also retained their original interpretations, relating to various temporal details of the tones. A set of formal quantitative models for the spectral dimension was constructed and tested, and the results further supported the interpretation of this perceptual axis.},
}

@article{Grey:1977,
	Author = {Grey, J. M.},
	Doi = {10.1121/1.381428},
	Journal = {The Journal of the Acoustical Society of America},
	Number = {5},
	Pages = {1270--1277},
	Publisher = {ASA},
	Title = {Multidimensional perceptual scaling of musical timbres},
	Volume = {61},
	Year = {1977},
	Abstract = {Two experiments were performed to evaluate the perceptual relationships between 16 music instrument tones. The stimuli were computer synthesized based upon an analysis of actual instrument tones, and they were perceptually equalized for loudness, pitch, and duration. Experiment 1 evaluated the tones with respect to perceptual similarities, and the results were treated with multidimensional scaling techniques and hierarchic clustering analysis. A three-dimensional scaling solution, well matching the clustering analysis, was found to be interpretable in terms of (1) the spectral energy distribution; (2) the presence of synchronicity in the transients of the higher harmonics, along with the closely related amount of spectral fluctuation within the tone through time; and (3) the presence of low-amplitude, high-frequency energy in the initial attack segment; an alternate interpretation of the latter two dimensions viewed the cylindrical distribution of clusters of stimulus points about the spectral energy distribution, grouping on the basis of musical instrument family (with two exceptions). Experiment 2 was a learning task of a set of labels for the 16 tones. Confusions were examined in light of the similarity structure for the tones from experiment 1, and one of the family-grouping exceptions was found to be reflected in the difficulty of learning the labels.},
}

@article{Houtsma:1990,
	Author = {Houtsma, A. J. M. and Smurzynski, J.},
	Doi = {10.1121/1.399297},
	Journal = {The Journal of the Acoustical Society of America},
	Keywords = {pitch perception, missing fundamental},
	Number = {1},
	Pages = {304--310},
	Publisher = {ASA},
	Title = {Pitch identification and discrimination for complex tones with many harmonics},
	Volume = {87},
	Year = {1990},
	Abstract = {Four experiments are reported that deal with pitch perception of harmonic complex tones containing up to 11 successive harmonics. In particular, the question is raised whether the pitch percept of the missing fundamental is mediated only by low-order resolvable harmonics, or whether it can also be conveyed by high-order harmonics that the cochlea fails to resolve. Melodic interval identification performance was found to remain significantly above chance level even if the range of harmonics extended from the 20th to the 30th. Just-noticeable differences (jnd) in the pitch of the missing fundamental were found to increase with increasing harmonic order, but to level off when all harmonics are above the 12th. These results are consistent with the notion of the existence of two distinct neural pitch mechanisms in the auditory system, but are, in principle, also compatible with a single central-spectrum mechanism that uses the interspike interval histograms of auditory-nerve fibers as inputs.},
}

@article{Smith:1978,
	Author = {Smith, J. C. and Marsh, J. T. and Greenberg, S. and Brown, W. S.},
	Doi = {10.1126/science.675250},
	Journal = {Science},
	Keywords = {missing fundamental, pitch perception},
	Number = {4356},
	Pages = {639--641},
	Title = {Human auditory frequency-following responses to a missing fundamental},
	Volume = {201},
	Year = {1978},
	Abstract = {Both a complex tone perceived as a 365-hertz ``missing fundamental'' and a 365-hertz pure tone evoked 365-hertz far-field frequency-following responses. Narrow-band masking noise centered at 365 hertz attenuated the responses to the pure tone but not to the complex tone. Results support the concept that perception of the missing fundamental is based on periodic neural activity.},
}

@inproceedings{Mitre:2006,
	Author = {Mitre, A. and Queiroz, M. and Faria, R. R. A.},
	Title = {Accurate and Efficient Fundamental Frequency Determination from Precise Partial Estimates},
	Booktitle = {Proceedings of the 4th AES Brazil Conference},
	Year = {2006},
	Url = {http://www.ime.usp.br/~mqz/Mitre_AESBR2006.pdf},
	Keywords = {pitch, pitch estimation},
	Abstract = {An algorithm is presented for the estimation of the fundamental frequency (F0 ) of monophonic sounds. The method relies upon accurate partial estimates, obtained on a frame basis by means of enhanced Fourier analysis. The use of state-of-the-art sinusoidal estimators allows the proposed algorithm to work with frames of minimum length (i.e., about two fundamental periods). The accuracy of method does not degrade for high pitched sounds, making it suitable for musical sounds.
},
}

@article{Cheveigne:2002,
	Author = {de Cheveign{\'e}, A. and Kawahara, H.},
	Doi = {10.1121/1.1458024},
	Journal = {The Journal of the Acoustical Society of America},
	Number = {4},
	Pages = {1917--1930},
	Title = {{YIN}, a fundamental frequency estimator for speech and music},
	Url = {http://recherche.ircam.fr/equipes/pcm/cheveign/pss/2002_JASA_YIN.pdf},
	Volume = {111},
	Year = {2002},
	Abstract = {An algorithm is presented for the estimation of the fundamental frequency ($F_0$) of speech or
musical sounds. It is based on the well-known autocorrelation method with a number of
modifications that combine to prevent errors. The algorithm has several desirable features. Error
rates are about three times lower than the best competing methods, as evaluated over a database of
speech recorded together with a laryngograph signal. There is no upper limit on the frequency
search range, so the algorithm is suited for high-pitched voices and music. The algorithm is
relatively simple and may be implemented efficiently and with low latency, and it involves few
parameters that must be tuned. It is based on a signal model (periodic signal) that may be extended
in several ways to handle various forms of aperiodicity that occur in particular applications. Finally,
interesting parallels may be drawn with models of auditory processing.},
}

@book{Hess:1983,
	Address = {New York},
	Author = {Hess, W.},
	Publisher = {Springer-Verlag},
	Title = {Pitch Determination of Speech Signals: Algorithms and Devices},
	Year = {1983},
}

@inproceedings{Cheveigne:2003,
	Author = {de Cheveign{\'e}, A. and Baskind, A.},
	Booktitle = {Proceedings of Eurospeech 2003},
	Keywords = {pitch, pitch estimation, YIN},
	Title = {{$F_0$} estimation of one or several voices},
	Url = {http://recherche.ircam.fr/equipes/pcm/cheveign/pss/2003_eurospeech.pdf},
	Year = {2003},
	Abstract = {A methodology is presented for fundamental frequency estimation of one or more voices. The signal is modeled as the sum of one or more periodic signals, and the parameters estimated by search with interpolation. Accurate, reliable estimates are obtained for each frame without tracking or continuity constraints, and without the use of specific instrument models (although their use might further boost performance). In formal evaluation over a large database of speech, the single-voice algorithm outperformed the best competing methods by a factor of three.},
}

@inproceedings{Dobrian:2004,
	Author = {Dobrian, C.},
	Title = {Strategies for Continuous Pitch and Amplitude Tracking in Real-Time Interactive Improvisation Software},
	Booktitle = {Proceedings of the 2004 Sound and Music Computing conference (SMC04)},
	Year = {2004},
	Url = {http://recherche.ircam.fr/equipes/repmus/SMC04/scm04actes/P32.pdf},
}

@inproceedings{Schwarz:2005,
	Author = {Schwarz, D.},
	Title = {Current research in Concatenative Sound Synthesis},
	Booktitle = {Proceedings of the International Computer Music Conference (ICMC)},
	Year = {2005},
	Url = {http://recherche.ircam.fr/equipes/analyse-synthese/schwarz/publications/icmc2005/Schwarz_ICMC2005_Current-Research.pdf},
	Abstract = {Concatenative synthesis is a promising method of musical sound synthesis with a steady stream of work and publications in recent years. It uses a large database of sound snippets to assemble a given target phrase. We explain its principle and components and its main applications, and compare existing concatenative synthesis approaches. We then list the most urgent problems for further work on concatenative synthesis.},
}

@article{Roads:1988,
	Author = {Roads, C.},
	Doi = {10.2307/3679937},
	Journal = {Computer Music Journal},
	Number = {2},
	Pages = {11--13},
	Title = {Introduction to {Granular} {Synthesis}},
	Volume = {12},
	Year = {1988},
}

@inproceedings{fowler1993a,
	Address = {Los Alamitos, CA},
	Author = {Fowler, J. E. and Ahalt, S. C.},
	Booktitle = {IEEE Data Compression Conference},
	Keywords = {vector quantization},
	Organization = {IEEE},
	Pages = {361--370},
	Publisher = {IEEE Computer Society},
	Title = {Robust, variable bit-rate coding using entropy-biased codebooks},
	Year = {1993},
	Abstract = {We demonstrate the use of a Differential Vector Quantization (DVQ) architecture for the coding of digital images. An Artificial Neural Network (ANN) is used to develop entropy-biased codebooks which yield substantial data compression without entropy coding and are very robust with respect to transmission channel errors. We discuss how these codebooks can be used to realize variable bit-rate coders. Two methods are presented for variable bit-rate coding using the described DVQ algorithm. In the first method, both the encoder and the decoder have multiple codebooks of different sizes. In the second, variable bit-rates are achieved by using subsets of one fixed codebook. We compare the performance of these approaches under conditions of error-free and error-prone channels. Our results show that this coding technique is very resistant to channel errors, and yields pictures of excellent visual quality at moderate compression rate.},
}

@inproceedings{Ding:2004,
	Address = {New York, NY, USA},
	Author = {Ding, C. and He, X.},
	Booktitle = {Proceedings of the International Conference on Machine Learning ({ICML} 2004)},
	Keywords = {PCA, k-means},
	Publisher = {ACM Press},
	Series = {ACM International Conference Proceeding Series},
	Title = {K-means clustering via {Principal} {Component} {Analysis}},
	Url = {http://crd.lbl.gov/~cding/papers/KmeansPCA1.pdf},
	Year = {2004},
	Abstract = {Principal component analysis (PCA) is a widely used statistical technique for unsupervised dimension reduction. K-means clustering is a commonly used data clustering for unsupervised learning tasks. Here we prove that principal components are the continuous solutions to the discrete cluster membership indicators for K-means clustering. Equivalently, we show that the subspace spanned by the cluster centroids are given by spectral expansion of the data covariance matrix truncated at $K-1$ terms. These results indicate that unsupervised dimension reduction is closely related to unsupervised learning. On dimension reduction, the result provides new insights to the observed effectiveness of PCA-based data reductions, beyond the conventional noise-reduction explanation. Mapping data points into a higher dimensional space via kernels, we show that solution for Kernel K-means is given by Kernel PCA. On learning, our results suggest effective techniques for K-means clustering. DNA gene expression and Internet newsgroups are analyzed to illustrate the results. Experiments indicate that newly derived lower bounds for K-means objective are within 0.5--1.5\% of the optimal values.},
}

@article{wanderley2002eid,
	Address = {Cambridge, MA, USA},
	Author = {Wanderley, M. M. and Orio, N.},
	Doi = {10.1162/014892602320582981},
	Journal = {Computer Music Journal},
	Keywords = {interfaces, evaluation, expressive interfaces},
	Number = {3},
	Pages = {62--76},
	Publisher = {MIT Press},
	Title = {Evaluation of Input Devices for Musical Expression: Borrowing Tools from {HCI}},
	Url = {http://muse.jhu.edu/journals/computer_music_journal/v026/26.3wanderley.pdf},
	Volume = {26},
	Year = {2002},
}

@inproceedings{Hunt:1999,
	Author = {Hunt, A. and Kirk, R.},
	Booktitle = {Proceedings of the Euromicro 99 Conference},
	Doi = {10.1109/EURMIC.1999.794755},
	Keywords = {interfaces, expressive interfaces},
	Title = {Radical user interfaces for real-time control},
	Url = {http://ieeexplore.ieee.org/iel5/6447/17218/00794755.pdf?arnumber=794755},
	Year = {1999},
	Abstract = {This paper describes recent work which challenges the predominance of the WIMP (Windows-Icons-Menus-Pointers) computer interface for use in real-time situations. The results of the work have implications for the design of user-interfaces for real-time control tasks (of which musical performance and experimentation are clear examples). This paper describes the tests, the interfaces, and the results from a variety of test subjects over several weeks. It then draws conclusions about the appropriateness of commonly accepted interfaces for complex and creative tasks.},
}

@inproceedings{Cook:2001,
	Author = {Cook, P.},
	Booktitle = {Proceedings of {N}ew {I}nterfaces for {M}usical {E}xpression ({NIME})},
	Keywords = {interfaces, expressive interfaces},
	Title = {Principles for Designing Computer Music Controllers},
	Url = {http://soundlab.cs.princeton.edu/publications/prc_chi2001.pdf},
	Year = {2001},
}

@inproceedings{Poepel:2005,
	Author = {Poepel, C.},
	Booktitle = {Proceedings of {N}ew {I}nterfaces for {M}usical {E}xpression ({NIME})},
	Keywords = {evaluation, performance, interfaces, expressive interfaces},
	Pages = {228--231},
	Title = {On Interface Expressivity: A Player-Based Study},
	Url = {http://hct.ece.ubc.ca/nime/2005/proc/nime2005_228.pdf},
	Year = {2005},
	Abstract = {While many new interfaces for musical expression have been presented in the past, methods to evaluate these interfaces are rare. This paper presents a method and a study comparing the potential for musical expression of different string-instrument based musical interfaces. Cues for musical expression are defined based on results of research in musical expression and on methods for musical education in instrumental pedagogy. Interfaces are evaluated according to how well they are estimated to allow players making use of their existing technique for the creation of expressive music.},
}

@article{botto:1989,
	Author = {Botto, J.-L. and Moustakides, G. V.},
	Issn = {0096-3518},
	Journal = {IEEE Transactions on Acoustics, Speech, and Signal Processing},
	Keywords = {Kalman filters, filtering and prediction theory, fast Kalman algorithms, filtering, minimization, roundoff errors},
	Number = {9},
	Pages = {1342--1348},
	Title = {Stabilizing the fast {Kalman} algorithms},
	Url = {http://ieeexplore.ieee.org/iel1/29/1349/00031289.pdf?tp=&isnumber=1349&arnumber=31289},
	Volume = {37},
	Year = {1989},
	Abstract = {The fast Kalman algorithms are stabilized by introducing a quantity that measures the accumulation of the roundoff errors. This quantity is used to correct the variables of the algorithm at every time step. The correction is defined as the solution of a specific minimization problem. The resulting algorithm still has the nice complexity properties of the original algorithm (linear in the number of parameters to be estimated), but it has a much more stable behavior.},
}

@misc{Stowell:2007b,
	Author = {Stowell, D.},
	Howpublished = {Musical work created for NetVoTech concert of live voice and technology},
	Month = jul,
	Title = {8-bit larynx},
	Year = {2007},
}

@inproceedings{Arenas-Garcia:2006,
	Author = {Arenas-Garc{\'\i}a, J. and Larsen, J. and Hansen, L. K. and Meng, A.},
	Booktitle = {Proceedings of the 7th International Conference on Music Information Retrieval ({ISMIR})},
	Keywords = {Music organization, filter bank model, positive constrained OPLS},
	Title = {Optimal filtering of dynamics in short-time features for music organization},
	Url = {http://ismir2006.ismir.net/PAPERS/ISMIR0643_Paper.pdf},
	Year = {2006},
	Abstract = {There is an increasing interest in customizable methods for organizing music collections. Relevant music characterization can be obtained from short-time features, but it is not obvious how to combine them to get useful information. In this work, a novel method, denoted as the Positive Constrained Orthonormalized Partial Least Squares (POPLS), is proposed. Working on the periodograms of MFCCs time series, this supervised method finds optimal filters which pick up the most discriminative temporal information for any music organization task. Two examples are presented in the paper, the first being a simple proof-of-concept, where an altosax with and without vibrato is modelled. A more complex 11 music genre classification setup is also investigated to illustrate the robustness and validity of the proposed method on larger datasets. Both experiments showed the good properties of our method, as well as superior performance when compared to a fixed filter bank approach suggested previously in the MIR literature. We think that the proposed method is a natural step towards a customized MIR application that generalizes well to a wide range of different music organization tasks.},
}

@phdthesis{Meng:2006,
	Author = {Meng, A.},
	Title = {Temporal Feature Integration for Music Organisation},
	School = {Informatics and Mathematical Modelling, Technical University of Denmark, {DTU}},
	Address = {Richard Petersens Plads, Building 321, {DK-}2800 Kgs. Lyngby},
	Year = {2006},
	Note = {Supervised by Jan Larsen and Lars Kai Hansen, {IMM}.},
	Url = {http://www2.imm.dtu.dk/pubdb/p.php?4502},
	Keywords = {temporal feature integration, summarising frame information},
	Abstract = {This Ph.D. thesis focuses on temporal feature integration for music organisation. Temporal feature integration is the process of combining all the feature vectors of a given time-frame into a single new feature vector in order to capture relevant information in the frame. Several existing methods for handling sequences of features are formulated in the temporal feature integration framework. Two datasets for music genre classification have been considered as valid test-beds for music organisation. Human evaluations of these, have been obtained to access the subjectivity on the datasets.



Temporal feature integration has been used for ranking various short-time features at different time-scales. This include short-time features such as the Mel frequency cepstral coefficients (MFCC), linear predicting coding coefficients (LPC) and various {MPEG-}7 short-time features. The consensus sensitivity ranking approach is proposed for ranking the short-time features at larger time-scales according to their discriminative power in a music genre classification task.



The multivariate {AR} ({MAR}) model has been proposed for temporal feature integration. It effectively models local dynamical structure of the short-time features. 



Different kernel functions such as the convolutive kernel, the product probability kernel and the symmetric Kullback Leibler divergence kernel, which measures similarity between frames of music have been investigated for aiding temporal feature integration in music organisation. A special emphasis is put on the product probability kernel for which the {MAR} model is derived in closed form. A thorough investigation, using robust machine learning methods, of the {MAR} model on two different music genre classification datasets, shows a statistical significant improvement using this model in comparison to existing temporal feature integration models. This improvement was more pronounced for the larger and more difficult dataset. Similar findings where observed using the {MAR} model in a product probability kernel. The {MAR} model clearly outperformed the other investigated density models: the multivariate Gaussian model and the Gaussian mixture model.},
}

@inproceedings{Adelman-Larsen:2007,
	Author = {Adelman-Larsen, N. and Thompson, E. R. and Gade, A. C.},
	Booktitle = {Proceedings of the 122nd Audio Engineering Society Convention},
	Month = may,
	Title = {Acoustics in Rock and Pop Music Halls},
	Year = {2007},
	Abstract = {The existing body of literature regarding the acoustic design of concert halls has focused almost exclusively on classical music, although there are many more performances of rhythmic music, including rock and pop. Objective measurements were made of the acoustics of twenty rock music venues in Denmark and a questionnaire was used in a subjective assessment of those venues with professional rock musicians and sound engineers. Correlations between the measurements and the questionnaire answers lead, among others, to a recommendation for reverberation time as a function of hall volume. Since the bass frequency sounds are typically highly amplified, they play an important role in the subjective ratings, and the 63 Hz band must be included in objective measurements and recommendations.},
}

@inproceedings{Stowell:2007a,
	Author = {Stowell, D. and Plumbley, M. D.},
	Booktitle = {Proceedings of the Digital Music Research Network ({DMRN}) Summer Conference},
	Keywords = {timbre, timbral analysis, performance, Real-time},
	Month = jul,
	Title = {Pitch-aware real-time timbral remapping},
	Url = {http://www.elec.qmul.ac.uk/digitalmusic/papers/2007/StowellPlumbley07-dmrn.pdf},
	Year = {2007},
	Abstract = {We propose an approach to timbral remapping, a process which maps the timbre variations of one audio source onto the timbre variations of another, for real-time control of synthesis. Puckette [17] has made a foray into such a concept, but there are two important issues which must be addressed: how best to construct the timbre space for remapping purposes; and how to perform this remapping efficiently in real-time. We review some of the acoustical features used in the literature to represent timbre, and consider how to combine these usefully. We also describe some of our recent work on warping timbre space for effective coverage, and on an optimised real-time database lookup procedure suitable for use in live remapping.},
}

@inproceedings{Stowell:2007,
	Author = {Stowell, D. and Plumbley, M. D.},
	Booktitle = {Proceedings of the International Computer Music Conference ({ICMC}'07)},
	Month = aug,
	Pages = {312--319},
	Title = {Adaptive whitening for improved real-time audio onset detection},
	Url = {http://www.elec.qmul.ac.uk/digitalmusic/papers/2007/StowellPlumbley07-icmc.pdf},
	Volume = {2},
	Year = {2007},
	Abstract = {We describe a new method for preprocessing STFT phase-vocoder frames for improved performance in real-time onset detection, which we term ``adaptive whitening''. The procedure involves normalising the magnitude of each bin according to a recent maximum value for that bin, with the aim of allowing each bin to achieve a similar dynamic range over time, which helps mitigate against the influence of spectral roll-off and strongly-varying dynamics. Adaptive whitening requires no training, is relatively lightweight to compute, and can run in real-time. Yet it can improve onset detector performance by more than ten percentage points (peak F-measure) in some cases, and improves the performance of most of the onset detectors tested.

We present results demonstrating that adaptive whitening can significantly improve the performance of various STFT-based onset detection functions, including functions based on the power, spectral flux, phase deviation, and complex deviation measures. Our results find the process to be especially beneficial for certain types of audio signal (e.g. complex mixtures such as pop music).},
}

@book{morrison1994ifa,
	Author = {Morrison, N.},
	Keywords = {Fourier transform},
	Publisher = {Wiley},
	Title = {Introduction to {Fourier} {Analysis}},
	Year = {1994},
}

@inproceedings{Magnusson:2007,
	Author = {Magnusson, T. and Mendieta, E. H.},
	Booktitle = {Proceedings of {N}ew {I}nterfaces for {M}usical {E}xpression ({NIME})},
	Keywords = {Survey, musical instruments, usability, ergonomics, embodiment, mapping, affordances, constraints, instrumental entropy, audio programming, latency, expressive interfaces},
	Month = jun,
	Title = {The Acoustic, the Digital and the Body: A Survey on Musical Instruments},
	Url = {http://www.ixi-audio.net/survey/},
	Year = {2007},
	Abstract = {This paper reports on a survey conducted in the autumn of 2006 with the objective to understand people's relationship to their musical tools. The survey focused on the question of embodiment and its different modalities in the fields of acoustic and digital instruments. The questions of control, instrumental entropy, limitations and creativity were addressed in relation to people's activities of playing, creating or modifying their instruments. The approach used in the survey was phenomenological, i.e. we were concerned with the experience of playing, composing for and designing digital or acoustic instruments. At the time of analysis, we had 209 replies from musicians, composers, engineers, designers, artists and others interested in this topic. The survey was mainly aimed at instrumentalists and people who create their own instruments or compositions in flexible audio programming environments such as SuperCollider, Pure Data, ChucK, Max/MSP, CSound, etc.},
}

@article{keeler1972ats,
	Author = {Keeler, J.},
	Journal = {IEEE Transactions on Audio and Electroacoustics},
	Keywords = {harmonics},
	Number = {5},
	Pages = {378--391},
	Title = {The attack transients of some organ pipes},
	Url = {http://ieeexplore.ieee.org/iel6/8337/26094/01162411.pdf?tp=&isnumber=&arnumber=1162411},
	Volume = {20},
	Year = {1972},
	Abstract = {Using piecewise-periodic techniques, the attack transients of almost 200 different organ pipes from 33 ranks have been analyzed. Results for 44 pipes representing 10 ranks are discussed in detail in this paper. In addition to the steady-state parameters of frequency and amplitude, transient duration, overshoot, delay and stability are used to describe the attack transients, and it is shown how each pipe, rank, and family differs from the others in these respects. Norms for the four families, flutes, foundations, strings, and reeds, are proposed.

Recognizing the importance of attack transients in characterizing musical sounds, these results provide a unique aid in the design of instruments, traditional or electronic, that generate pipe organ sounds. The technique of piecewise-periodic analysis can, in addition, be used on many other musical instruments and synthesizers as an aid in their design, manufacture, and operation.},
}

@article{Jensen:2002,
	Author = {Jensen, K.},
	Title = {Perceptual and Physical Aspects of Musical Sounds},
	Journal = {Journal of the ITC-SRA},
	Volume = {16},
	Year = {2002},
	Url = {http://www.cs.aaue.dk/~krist/papers/sangeet02-ppaoms.pdf},
}

@inproceedings{tsai2004agp,
	Author = {Tsai, C. G.},
	Booktitle = {Proceedings of the International Symposium on Musical Acoustics},
	Keywords = {harmonics, subharmonics, perception},
	Title = {Auditory Grouping in the Perception of Roughness Induced by Subharmonics: Empirical Findings and a Qualitative Model},
	Url = {http://homepage.ntu.edu.tw/~gim/gia/pub/grouping_sub4.pdf},
	Year = {2004},
	Abstract = {Quasi-periodic sounds with subharmonics at (2n-1)f0/2 (where f0 is the perceived pitch, n = 1, 2, 3...) can be produced by musical instruments such as the saxophone, the trombone, the violin, and the Chinese membrane flute. Lower subharmonics in a natural sound are always too weak to evoke the pitch f0/2, but upper subharmonics (>11f0/2) can be strong enough to affect the sound quality. Subharmonics are common in human vocalizations and have been identified as a source of roughness. However, this type of roughness cannot be explained by existing psychoacoustic models and appears to contradict the theory of consonance--dissonance. The present study provided a qualitative model of roughness induced by subharmonics with the consideration of higher-order mechanisms of auditory grouping. The key assumption was that interference between components at nf0 lying in the same critical bands would be largely reduced once they are grouped by a robust pitch sensation of f0. Roughness induced by subharmonics reflects a limitation of the pitch-based grouping mechanism, as the perceived pitch is too high for grouping the subharmonics.},
}

@inproceedings{Tsai:2004,
	Author = {Tsai, C. G.},
	Title = {Helmholtz's Nasality Revisited: Physics and Perception of Sounds with Predominance of Upper Odd-numbered Harmonics},
	Booktitle = {Proceedings of the International Symposium on Musical Acoustics},
	Year = {2004},
	Url = {http://homepage.ntu.edu.tw/~gim/gia/pub/nasality5.pdf},
	Keywords = {harmonics, perception},
	Abstract = {The sound qualities associated with the predominance of odd-numbered harmonics were first described by Helmholtz. He reported that if only the odd-numbered harmonics were present, the quality of tone was hollow; when a large number of such upper harmonics were present, nasal. Helmholtz's nasality implies a special type of nasal voices, which is produced when a mucous membrane is formed in the nasal cavity and characterized by the predominance of odd-numbered harmonics in the range of 2-5 kHz. This predominance can also be found in some tones of the Chinese membrane flute (dizi). I suggest that the physics underlying Helmholtz's nasality is a quasi-sinusoidally driven membrane with cubic nonlinearity. In the study of timbre perception, the pattern of jagged spectra in hollow or nasal sounds is analogized to the pattern of luminance-varying gratings in visual sensation. Further, the predominance of upper odd-numbered harmonics can elicit additional pitches beside the fundamental frequency f0. This multi-pitch effect in dizi music is quantitatively studied in terms of autocorrelation analysis. 

},
}

@article{Caclin:2005,
	Author = {Caclin, A. and McAdams, S. and Smith, B. K. and Winsberg, S.},
	Doi = {10.1121/1.1929229},
	Journal = {The Journal of the Acoustical Society of America},
	Keywords = {hearing, musical acoustics, loudness, acoustic correlation, harmonics, odd-vs-even},
	Number = {1},
	Pages = {471--482},
	Publisher = {ASA},
	Title = {Acoustic correlates of timbre space dimensions: A confirmatory study using synthetic tones},
	Url = {http://link.aip.org/link/?JAS/118/471/1},
	Volume = {118},
	Year = {2005},
	Abstract = {Timbre spaces represent the organization of perceptual distances, as measured with dissimilarity ratings, among tones equated for pitch, loudness, and perceived duration. A number of potential acoustic correlates of timbre-space dimensions have been proposed in the psychoacoustic literature, including attack time, spectral centroid, spectral flux, and spectrum fine structure. The experiments reported here were designed as direct tests of the perceptual relevance of these acoustical parameters for timbre dissimilarity judgments. Listeners presented with carefully controlled synthetic tones use attack time, spectral centroid, and spectrum fine structure in dissimilarity rating experiments. These parameters thus appear as major determinants of timbre. However, spectral flux appears as a less salient timbre parameter, its salience depending on the number of other dimensions varying concurrently in the stimulus set. Dissimilarity ratings were analyzed with two different multidimensional scaling models (CLASCAL and CONSCAL), the latter providing psychophysical functions constrained by the physical parameters. Their complementarity is discussed.},
}

@article{Roberts:1998,
	Author = {Roberts, B.},
	Doi = {10.1121/1.423086},
	Journal = {The Journal of the Acoustical Society of America},
	Keywords = {hearing},
	Number = {6},
	Pages = {3588--3596},
	Publisher = {ASA},
	Title = {Effects of spectral pattern on the perceptual salience of partials in harmonic and frequency-shifted complex tones: A performance measure},
	Url = {http://link.aip.org/link/?JAS/103/3588/1},
	Volume = {103},
	Year = {1998},
	Abstract = {A single even harmonic added to an odd-harmonic complex is often judged to be more salient than its odd neighbors in a clarity rating task [Roberts and Bregman, J. Acoust. Soc. Am. 90, 3050--3060 (1991)]. This study used similar complexes in a two-interval forced-choice procedure. Each interval consisted of a complex tone followed by a pure tone, whose frequency matched that of a harmonic in one interval but was changed by $\pm 0.5 \times$ fundamental frequency in the other. Subjects were asked to identify the matching interval. Since the pure tone followed the complex tone, it could not cue listening to a particular frequency region. The possibility of cross-interval cuing was reduced by changing the fundamental frequency between intervals (100--150 Hz range). The procedure was designed to maximize the effects on performance of differences in immediate perceptual salience between the partials. The added even harmonic was typically judged with greater accuracy than its odd neighbors (experiment 1), though this effect was greatly reduced for harmonics above 8 (experiment 2). The even--odd difference persisted when the original stimuli were made inharmonic by applying a frequency shift of 15\%, but was abolished for stimuli consisting of successive partials (experiment 3).},
}

@article{Roberts:1996,
	Author = {Roberts, B. and Bailey, P. J.},
	Journal = {Perception {\&} Psychophysics},
	Keywords = {harmonics, perception},
	Number = {2},
	Pages = {289--299},
	Title = {Regularity of spectral pattern and its effects on the perceptual fusion of harmonics},
	Url = {http://www.aston.ac.uk/downloads/lhs/robertsb/08_Roberts&Bailey1996a.pdf},
	Volume = {58},
	Year = {1996},
	Abstract = {A single even harmonic added to an odd-harmonic complex may be judged as perceptually more salient than the odd harmonics themselves (Roberts {\&} Bregman, 1991). It is proposed that this effect occurs because the even harmonic is inconsistent with the regular spectral pattern formed by the odd-harmonic complex (the base). Therefore, a reduction in the regularity of the base spectrum should reduce the even--odd difference. Spectral regularity was reduced either by removing base components, or by including components in the base that were inconsistent with its original pattern. Subjects listened to (primarily) harmonic complex tones and rated the clarity of one of the harmonics, cued by a preceding pure tone. Both removing components from the base and including extraneous components in the base reduced the even--odd difference. The results suggest that it is easier to segregate a harmonic from a periodic complex tone when it does not form part of the regular pattern of spectral spacing defined by the other harmonics.},
}

@inproceedings{jelinek1999fds,
	Author = {Jelinek, M. and Adoul, J.-P.},
	Booktitle = {Proceedings of the International Conference on Acoustics, Speech, and Signal Processing ({ICASSP}'99)},
	Doi = {10.1109/ICASSP.1999.758110},
	Keywords = {harmonics},
	Title = {Frequency-domain spectral envelope estimation for low rate coding of speech},
	Url = {http://www.gel.usherbrooke.ca/audio/docs/ICASSP1999_Jelinek.pdf},
	Year = {1999},
	Abstract = {Estimation of the spectral envelope in the frequency domain allows to avoid some problems of linear prediction (LP) algorithms for voiced speech. We present a low complexity method of spectral envelope estimation from harmonics for low rate coding. The method consists in computing the harmonic amplitude spectrum using pitch-synchronous DFT with length depending on voicing, modifying this spectrum outside the telephone bandwidth to simplify modeling of the useful bandwidth and interpolating it by a frequency-domain low-pass filter. An all-pole model is then fitted to this modified smoothed version of the harmonic spectrum. The method was implemented on the harmonic-stochastic excitation (HSX) vocoder and the performance was compared with the LP algorithm similar to that used in the G.729 speech coding standard. A-B comparative tests show an important increase in perceptual quality.},
}

@inproceedings{Kirkpatrick:2006,
	Author = {Kirkpatrick, B. and O'Brien, D. and Scaife, R.},
	Booktitle = {Irish Signals and Systems Conference},
	Keywords = {harmonic cepstrum, speech recognition},
	Month = jun,
	Pages = {515--520},
	Publisher = {IET},
	Title = {A comparison of spectral continuity measures as a join cost in concatenative speech synthesis},
	Url = {http://www.computing.dcu.ie/~bkirkpatrick/my_papers/Kirkpatrick_ISSC_06.pdf},
	Year = {2006},
	Abstract = {The quality of concatenative speech synthesis depends on the cost function employed for unit selection. Cost functions for spectral continuity are difficult to define and standard measures used for this task often do not accurately reflect human perception of discontinuity across a concatenated join. We compare a set of standard distance measures for the task of detecting audible discontinuities and introduce a new measure. A perceptual experiment is described that was used to relate each measure to human perception of discontinuities. The impact of window length on feature extraction and subsequent detection of discontinuities is investigated. The distance measure approach to detecting audible discontinuities is extended to a feature space based representation and feature transformations are investigated as a means of improving discontinuity detection. Receiver Operating Characteristic (ROC) curves are used to compare the results, which indicate that the feature space approach improves on the performance of standard measures.},
}

@inproceedings{Gu:2001,
	Author = {Gu, L. and Rose, K.},
	Booktitle = {Proceedings of the International Conference on Acoustics, Speech, and Signal Processing (ICASSP '01)},
	Keywords = {harmonic cepstrum, harmonics, antiMFCC},
	Title = {Perceptual harmonic cepstral coefficients for speech recognition in noisy environment},
	Url = {http://mti.xidian.edu.cn/multimedia/2001/supp/icassp2001/MAIN/papers/pap1068.pdf},
	Year = {2001},
	Abstract = {Perceptual harmonic cepstral coefficients (PHCC) are proposed as features to extract from speech for recognition in noisy environments. A weighting function, which depends on the prominence of the harmonic structure, is applied to the power spectrum to ensure accurate representation of the voiced speech spectral envelope. The harmonics weighted power spectrum undergoes mel-scaled band-pass filtering, and the log-energy of the filters' output is discrete cosine transformed to produce cepstral coefficients. Lower spectral clipping is applied to the power spectrum, followed by within-filter root-power amplitude compression to reduce amplitude variation without compromise of the gain invariance properties. Experiments show significant recognition gains of PHCC over MFCC, with 23\% and 36\% error rate reduction for the Mandarin digit database in white and babble noise environments.},
}

@misc{Rose:2004,
	Author = {Rose, K. and Gu, L.},
	Howpublished = {Patent application WO/2002/029782},
	Keywords = {harmonic cepstrum, harmonics},
	Title = {Perceptual harmonic cepstral coefficients as the front-end for speech recognition},
	Url = {http://www.wipo.int/pctdb/en/wo.jsp?wo=2002029782},
	Year = {2002},
	Abstract = {Pitch estimation and classification into voiced, unvoiced and transitional speech were performed by a spectro-temporal auto-correlation technique. A peak picking formula was then employed. A weighting function was then applied to the power spectrum. The harmonics weighted power spectrum underwent mel-scaled band-pass filtering, and the log-energy of the filter's output was discrete cosine transformed to produce cepstral coefficients. A within-filter cubic-root amplitude compression was applied to reduce amplitude variation without compromise of the gain invariance properties.},
}

@inproceedings{gu2000phc,
	Author = {Gu, L. and Rose, K.},
	Booktitle = {Proceedings of the International Conference on Spoken Language Processing (ICSLP 2000)},
	Keywords = {harmonics, harmonic cepstrum, antiMFCC},
	Pages = {309--312},
	Title = {Perceptual harmonic cepstral coefficients as the front-end for speech recognition},
	Url = {http://www.ece.ucsb.edu/scl/pubs/pubs_B/b00_4.pdf},
	Volume = {1},
	Year = {2000},
	Abstract = {Perceptual harmonic cepstral coefficients (PHCC) are proposed as features to extract for speech recognition. Pitch estimation and classification into voiced, unvoiced, and transitional speech are performed by a spectro-temporal auto-correlation technique. A peak picking algorithm is then employed to precisely locate pitch harmonics. A weighting function, which depends on the classification and the pitch harmonics, is applied to the power spectrum and ensures accurate representation of the voiced speech spectral envelope. The harmonics weighted power spectrum undergoes mel-scaled band-pass filtering, and the log-energy of the filters' output is discrete cosine transformed to produce cepstral coefficients. For perceptual considerations, within-filter cubic-root amplitude compression is applied to reduce amplitude variation without compromise of the gain invariance properties. Experiments show substantial recognition gains of PHCC over MFCC, with 48\% and 15\% error rate reduction for the Mandarin digit database and E-set, respectively.},
}

@article{zhang2005sms,
	Author = {Zhang, Y. G. and Zhang, C. S.},
	Journal = {Advances in Neural Information Processing Systems},
	Pages = {1619--1626},
	Title = {Separation of Music Signals by Harmonic Structure Modeling},
	Url = {http://books.nips.cc/papers/files/nips18/NIPS2005_0184.pdf},
	Volume = {18},
	Year = {2005},
	Abstract = {Separation of music signals is an interesting but difficult problem. It is helpful for many other music researches such as audio content analysis. In this paper, a new music signal separation method is proposed, which is based on harmonic structure modeling. The main idea of harmonic structure modeling is that the harmonic structure of a music signal is stable, so a music signal can be represented by a harmonic structure model. Accordingly, a corresponding separation algorithm is proposed. The main idea is to learn a harmonic structure model for each music signal in the mixture, and then separate signals by using these models to distinguish harmonic structures of different signals. Experimental results show that the algorithm can separate signals and obtain not only a very high Signal-to-Noise Ratio (SNR) but also a rather good subjective audio quality.},
}

@inproceedings{Minematsu:2007,
	Author = {Minematsu, Nobuaki and Maruyama, Kazutaka and Sakuraba, Kyoko and Hirose, Keikichi and Tayama, Niro and Imaizumi, Satoshi and Yamauchi, Toshio},
	Booktitle = {Proceedings of the International Conference on Acoustics, Speech, and Signal Processing (ICASSP '07)},
	Pages = {297--300},
	Title = {Development of a femininity estimator using speaker recognition techniques for voice therapy of gender identity disorder clients},
	Year = {2007},
	Abstract = {This paper describes the development of an estimator of perceptual femininity (PF) of an input utterance using speaker recognition techniques. The estimator was designed for clinical use and the target speakers are Gender Identity Disorder clients, especially MtF (Male to Female) transsexuals. GMMs of F0 values and spectrums were built separately for biologically male speakers and female ones. Using the four models, PF was estimated automatically for each of 142 utterances of 111 MtFs. The estimated values were compared with the PF values obtained originally through listening tests. Results showed very high correlation (R=0.86), which is comparable to the intra-rater correlation.},
}

@inproceedings{Sundaram:2007,
	Author = {Sundaram, S. and Narayanan, S.},
	Booktitle = {Proceedings of the International Conference on Acoustics, Speech, and Signal Processing (ICASSP '07)},
	Keywords = {antiMFCC},
	Pages = {213--216},
	Title = {Discriminating two types of noise sources using cortical representation and dimension reduction technique},
	Year = {2007},
	Abstract = {Content-based audio classification techniques have focused on classifying events that are both semantically and perceptually distinct (such as speech, music, environmental sounds etc.). However, it is both useful and challenging to develop systems that can also discern sources that are semantically and perceptually close. In this paper we present results of our experiments on discriminating two types of noise sources. Particularly, we focus on machine-generated versus natural noise sources. A bio-inspired tensor representation of audio that models the processing at the primary auditory cortex is used for feature extraction. To handle large tensor feature sets, we use a generalized discriminant analysis method to reduce the dimension. We also present a novel technique of partitioning data into smaller subsets and combining the results of individual analysis before training pattern classifiers. The results of the classification experiments indicate that cortical representation performs 25\% better than the common perceptual feature set used in audio classification systems (MFCCs).},
}

@inproceedings{Goncalves:2007,
	Author = {Goncalves, P. and Abry, P. and Rilling, G. and Flandrin, P.},
	Booktitle = {Proceedings of the International Conference on Acoustics, Speech, and Signal Processing (ICASSP '07)},
	Pages = {1153--1156},
	Title = {Fractal dimension estimation: Empirical mode decomposition versus wavelets},
	Year = {2007},
	Abstract = {We address the problem of fractal dimension estimation of a discrete sample path. After recalling the multiplicity of possible definitions, we focus on the regularity dimension and on the regularization dimension, and report on the common ingredients that underlie these definitions: a scale transform of the signal, and a geometric or statistical measure on the scaled signal. Then, we propose to interchange wavelet transforms, ordinarily used as the scale transform, with empirical mode decomposition (EMD), a recently proposed signal-adaptive transform. The adaptivity of this latter yields estimation performance that overhauls usual wavelet-based techniques. To support our claim, we obtain comprehensive results from a Monte Carlo simulation on fractional Brownian motions.},
}

@inproceedings{Nielsen:2007,
	Author = {Nielsen, A. B. and Sigurdsson, S. and Hansen, L. K. and Arenas-Garc{\'\i}a, J.},
	Booktitle = {Proceedings of the International Conference on Acoustics, Speech, and Signal Processing (ICASSP '07)},
	Keywords = {MFCC, harmonics},
	Pages = {485--488},
	Title = {On the relevance of spectral features for instrument classification},
	Year = {2007},
	Abstract = {Automatic knowledge extraction from music signals is a key component for most music organization and music information retrieval systems. In this paper, we consider the problem of instrument modelling and instrument classification from the rough audio data. Existing systems for automatic instrument classification operate normally on a relatively large number of features, from which those related to the spectrum of the audio signal are particularly relevant. In this paper, we confront two different models about the spectral characterization of musical instruments. The first assumes a constant envelope of the spectrum (i.e., independent from the pitch), whereas the second assumes a constant relation among the amplitude of the harmonics. The first model is related to the Mel Frequency Cepstrum Coefficients (MFCCs), while the second leads to what we will refer to as Harmonic Representation (HR). Experiments on a large database of real instrument recordings show that the first model offers a more satisfactory characterization, and therefore MFCCs should be preferred to HR for instrument modelling/classification.},
}

@inproceedings{Ryynanen:2007,
	Author = {Ryyn{\"a}nen, M. and Klapuri, A.},
	Booktitle = {Proceedings of the International Conference on Acoustics, Speech, and Signal Processing (ICASSP '07)},
	Pages = {1437--1440},
	Title = {Automatic bass line transcription from streaming polyphonic audio},
	Year = {2007},
	Abstract = {This paper proposes a method for the automatic transcription of the bass line in polyphonic music. The method uses a multiple-F0 estimator as a front-end and this is followed by acoustic and musicological models. The acoustic modeling consists of separate models for bass notes and rests. The musicological model estimates the key and determines probabilities for the transitions between notes using a conventional bigram or a variable-order Markov model. The transcription is obtained with Viterbi decoding through the note and rest models. In addition, a causal algorithm is presented which allows transcription of streaming audio. The method was evaluated using 87 minutes of music from the RWC Popular Music Database. Recall and precision rates of 64\% and 60\%, respectively, were achieved for discrete note events.},
}

@article{cariolaro2002fdc,
	Author = {Cariolaro, G. and Erseghe, T. and Kraniauskas, P.},
	Journal = {IEEE Transactions on Signal Processing},
	Number = {4},
	Pages = {902--911},
	Title = {The fractional discrete cosine transform},
	Url = {http://ieeexplore.ieee.org/iel5/78/21392/00992138.pdf},
	Volume = {50},
	Year = {2002},
	Abstract = {The extension of the Fourier transform operator to a fractional power has received much attention in signal theory and is finding attractive applications. The paper introduces and develops the fractional discrete cosine transform (DCT) on the same lines, discussing multiplicity and computational aspects. Similarities and differences with respect to the fractional Fourier transform are pointed out.},
}

@manual{Sinclair:1985,
	Author = {{Sinclair}},
	Keywords = {AY, soundchip},
	Title = {Sinclair Spectrum 128 Service Manual},
	Url = {ftp://ftp.worldofspectrum.org/pub/sinclair/technical-docs/ZXSpectrum128K_TechnicalManual.pdf},
	Year = {1985},
	Abstract = {1.1 The Spectrum 128 is a derivative of the 48K Spectrum Plus
offering 128K of RAM, music quality sound, greatly improved
video quality and higher hardware reliability.

1.2 The firmware is capable of running in Spectrum 48K mode or,
alternatively in 128K mode, which will support paged memory in
the form of a RAM disk. Extended BASIC to handle the sound
facility is provded, and a full screen editor is incorporated
in the firmware.

1.3 A list of the principle features appears below:
(a) 128K dynamic RAM
(b) 32K ROM
(c) Numeric keypad
(d) TV sound with composite video
(e) Elimination of dot crawl (single crystal operation)
(f) RGB output
(g) RS232 serial port
(h) Musical instrument digital interface (MIDI)
(j) Software compatible with all previous Spectrums
(k) Edge connector compatible with Spectrum.},
}

@article{Howard:2001,
	Author = {Howard, D. M. and Tyrrell, A. M.},
	Doi = {10.1017/S1355771897009011},
	Journal = {Organised Sound},
	Number = {2},
	Pages = {65--76},
	Publisher = {Cambridge Univ Press},
	Title = {Psychoacoustically informed spectrography and timbre},
	Url = {http://www.journals.cambridge.org/action/displayFulltext?type=1&fid=76420&jid=&volumeId=&issueId=02&aid=76419},
	Volume = {2},
	Year = {1997},
	Abstract = {Pitch and loudness are subjective aspects of sound which can be described in terms of the observed abilities of subjects to rate them on a scale from `low' to `high'. Timbre is a subjective aspect of sound for which there is no such scale and neither qualitative nor quantitative descriptions are generally found that are widely accepted. The purpose of this paper is to shed light on some frequency domain aspects of the nature of timbre by making use of the results obtained from an analysis system which is designed to take advantage of contemporary psychoacoustical knowledge relating to human peripheral hearing. Results are presented which illustrate the relationship between contemporary psychoacoustic ideas relating to timbre and ideas first discussed by Helmholtz and later taken up by other researchers. Analyses by the system of a selection of sounds from acoustic musical instruments with clear timbral differences are also presented in order to place these discussions in a musical context.},
}

@book{ANSI:1960,
	Author = {{ANSI}},
	Title = {Acoustical terminology},
	Number = {S1.1-1960},
	Publisher = {{A}merican {N}ational {S}tandards {I}nstitute},
	Address = {New York},
	Year = {1960},
}

@article{Bass:1985,
	Author = {Bass, P.},
	Journal = {Antic},
	Keywords = {AY, soundchip},
	Month = nov,
	Number = {7},
	Pages = {16},
	Title = {{ST} Sound: Hearing the {AY-3-8910} chip},
	Url = {http://www.atarimagazines.com/v4n7/stsound.html},
	Volume = {4},
	Year = {1985},
}

@manual{General-Instrument:1980,
	Author = {{General Instrument}},
	Title = {{GI AY-3-8910} {P}rogrammable {S}ound {G}enerator datasheet},
	Year = {early 1980s},
	Keywords = {AY, soundchip},
}

@book{Takayasu:1990,
	Author = {Takayasu, H.},
	Title = {Fractals in the Physical Sciences},
	Publisher = {Manchester University Press},
	Year = {1990},
	Keywords = {fractals, fractal dimension},
}

@book{duda2000pc,
	Author = {Duda, R. O. and Hart, P. E. and Stork, D. G.},
	Edition = {Second},
	Keywords = {ICA, source separation, self-organizing map},
	Publisher = {Wiley-Interscience},
	Title = {Pattern Classification},
	Year = {2000},
}

@article{Maragos:1999,
	Author = {Maragos, P. and Potamianos, A.},
	Title = {Fractal dimensions of speech sounds: Computation and application to automatic speech recognition},
	Journal = {The Journal of the Acoustical Society of America},
	Volume = {105},
	Pages = {1925},
	Year = {1999},
	Publisher = {ASA},
	Doi = {10.1121/1.426738},
	Url = {http://cvsp.cs.ntua.gr/publications/jpubl+bchap/MaragosPotamianos_SpeecFrDimRecogn_JASA1999.pdf},
	Keywords = {fractals},
	Abstract = {The dynamics of airflow during speech production may often result in some small or large degree of turbulence. In this paper, the geometry of speech turbulence as reflected in the fragmentation of the time signal is quantified by using fractal models. An efficient algorithm for estimating the short-time fractal dimension of speech signals based on multiscale morphological filtering is described, and its potential for speech segmentation and phonetic classification discussed. Also reported are experimental results on using the short-time fractal dimension of speech signals at multiple scales as additional features in an automatic speech-recognition system using hidden Markov models, which provide a modest improvement in speech-recognition performance.},
}

@article{PhysRevA.39.1500,
	Author = {Dubuc, B. and Quiniou, J. F. and Roques-Carmes, C. and Tricot, C. and Zucker, S. W.},
	Doi = {10.1103/PhysRevA.39.1500},
	Journal = {Physical Review A},
	Keywords = {fractals},
	Month = feb,
	Number = {3},
	Numpages = {12},
	Pages = {1500--1512},
	Publisher = {American Physical Society},
	Title = {Evaluating the fractal dimension of profiles},
	Url = {http://prola.aps.org/pdf/PRA/v39/i3/p1500_1},
	Volume = {39},
	Year = {1989},
	Abstract = {There are many definitions of the fractal dimension of an object, including box dimension, Bouligand-Minkowski dimension, and intersection dimension. Although they are all equivalent in the continuous domain, they differ substantially when discretized and applied to digitized data. We show that the standard implementations of these definitions on self-affine curves with known fractal dimension (Weierstrass-Mandelbrot, Kiesswetter, fractional Brownian motion) yield results with significant errors. An analysis of the source of these errors leads to a new algorithm in one dimension, called the variation method, which yields accurate results. The variation method uses the notion of $\epsilon$ oscillation to measure the amplitude of the one-dimensional function in an $\epsilon$ neighborhood. The order of growth of the integral of the $\epsilon$ oscillation (called the $\epsilon$ variation), as $\epsilon$ tends toward zero, is directly related to the fractal dimension. In this paper, we present the variation method for one-dimensional (1D) profiles and show that, in the limit, it is equivalent to the classical box-counting method. The result is an algorithm for reliably estimating the fractal dimension of 1D profiles; i.e., graphs of functions of a single variable. The algorithm is tested on profiles with known fractal dimension.},
}

@article{soille-rivest96,
	Author = {Soille, P. and Rivest, J.-F.},
	Journal = {Journal of Visual Communication and Image Representation},
	Keywords = {fractals},
	Month = sep,
	Number = {3},
	Pages = {217--229},
	Title = {On the validity of fractal dimension measurements in image analysis},
	Url = {http://ams.jrc.it/soille/soille-rivest96.pdf},
	Volume = {7},
	Year = {1996},
	Abstract = {Fractal dimension is a parameter frequently used to analyze textures at different scales. There are several alternative definitions of the fractal dimension and consequently many algorithms have been proposed to determine its value. In this paper, we assess the robustness of all these algorithms. This is achieved by comparing their respective behaviour under linear transformations of the image intensity values. The estimated fractal dimension should be invariant to these transformations. We show that reliable algorithms are restricted to Flat Structuring Element, Variogram, and Power Spectrum methods. Experiments are performed on simulated and natural images.},
}

@article{Story:2001,
	Author = {Story, B. H. and Titze, I. R. and Hoffman, E. A.},
	Doi = {10.1121/1.1352085},
	Journal = {The Journal of the Acoustical Society of America},
	Month = apr,
	Number = {4},
	Pages = {1651--1667},
	Title = {The relationship of vocal tract shape to three voice qualities},
	Volume = {109},
	Year = {2001},
	Abstract = {Three-dimensional vocal tract shapes and consequent area functions representing the vowels [, {\ae}, , ] have been obtained from one male and one female speaker using magnetic resonance imaging (MRI). The two speakers were trained vocal performers and both were adept at manipulation of vocal tract shape to alter voice quality. Each vowel was performed three times, each with one of the three voice qualities: normal, yawny, and twangy. The purpose of the study was to determine some ways in which the vocal tract shape can be manipulated to alter voice quality while retaining a desired phonetic quality. To summarize any overall tract shaping tendencies mean area functions were subsequently computed across the four vowels produced within each specific voice quality. Relative to normal speech, both the vowel area functions and mean area functions showed, in general, that the oral cavity is widened and tract length increased for the yawny productions. The twangy vowels were characterized by shortened tract length, widened lip opening, and a slightly constricted oral cavity. The resulting acoustic characteristics of these articulatory alterations consisted of the first two formants (F1 and F2) being close together for all yawny vowels and far apart for all the twangy vowels.},
}

@inproceedings{Jiang:2002,
	Author = {Jiang, D.-N. and Lu, L. and Zhang, H.-J. and Tao, J.-H. and Cai, L.-H.},
	Title = {Music type classification by spectral contrast feature},
	Booktitle = {Proceedings of the International Conference on Multimedia and Expo (ICME '02)},
	Volume = {1},
	Pages = {113--116},
	Year = {2002},
	Doi = {10.1109/ICME.2002.1035731},
	Keywords = {feature extraction; multimedia databases; music; pattern classification; spectral analysis; automatic music type classification; digital music databases; music clip; octave-based spectral contrast feature; relative spectral distribution; spectral characteristics representation},
	Ty = {CONF},
	Abstract = {Automatic music type classification is very helpful for the management of digital music databases. In this paper, the octave-based spectral contrast feature is proposed to represent the spectral characteristics of a music clip. It represented the relative spectral distribution instead of average spectral envelope. Experiments show that the octave-based spectral contrast feature performs well in music type classification. Another comparison experiment demonstrates that the octave-based spectral contrast feature has a better discrimination among different music types than mel-frequency cepstral coefficients (MFCC), which is often used in previous music type classification systems.},
}

@inproceedings{Allamanche:2003,
	Author = {Allamanche, E. and Herre, J. and Hellmuth, O. and Kastner, T. and Ertel, C.},
	Booktitle = {Proceedings of the International Symposium on Music Information Retrieval (ISMIR)},
	Title = {A multiple feature model for musical similarity retrieval},
	Url = {http://ismir2003.ismir.net/papers/Allamanche.pdf},
	Year = {2003},
	Abstract = {Despite the ``fuzzy'' nature of musical similarity, which varies from one person to another, perceptual low level features combined with appropriate classification schemes have proven to perform satisfactorily for this task. Since a single feature only captures some selective characteristics of an audio signal, this information may, in some cases, not be sufficient to properly identify similarities between songs. This paper presents a system which combines a set of acoustic features for the task of retrieving similar sounding songs. The methodology for optimum feature selection and combination is explained, and the system's performance is assessed by means of a subjective listening test.},
}

@inproceedings{Herre:2001,
	Author = {Herre, J. and Allamanche, E. and Hellmuth, O.},
	Booktitle = {Workshop on the Applications of Signal Processing to Audio and Acoustics (WASPAA-2001)},
	Keywords = {audio signal processing; feature extraction; pattern matching; spectral analysis; audio signal identification; audio signal matching; content-related techniques; multimedia data; signal distortions; spectral flatness features},
	Pages = {127--130},
	Title = {Robust matching of audio signals using spectral flatness features},
	Year = {2001},
	Abstract = {Stimulated by the ever-increasing amount of available multimedia data, content-related techniques for the management of audio material have received much interest recently. This paper discusses the problem of robust identification of audio signals by matching them to a known reference. In order to perform well under real-world conditions, the matching process needs to rely on features which are robust with respect to common signal distortions. A family of suitable features with favorable properties is proposed and evaluated for their recognition performance. Applications of signal matching, including fingerprinting, are discussed.},
}

@inproceedings{Tyagi:2005,
	Author = {Tyagi, V. and Wellekens, C.},
	Booktitle = {Proceedings of the International Conference on Acoustics, Speech, and Signal Processing (ICASSP '05)},
	Doi = {10.1109/ICASSP.2005.1415167},
	Issn = {1520-6149},
	Keywords = {MFCC, MFCC extraction algorithm, MFCC extraction method},
	Pages = {529--532},
	Title = {On desensitizing the {Mel}-Cepstrum to spurious spectral components for Robust Speech Recognition},
	Volume = {1},
	Year = {2005},
	Abstract = {It is well known that the peaks in log Mel-filter bank spectrum are important cues in characterizing the speech sounds. However, low energy perturbations in the power spectrum may become numerically significant after the log compression. We show that even if the spectral peaks are kept constant, the low energy perturbations in the power spectrum can create huge variations in the cepstral coefficients. We show, both analytically and experimentally, that exponentiating the log Mel-filter bank spectrum before the cepstrum computation can significantly reduce the sensitivity of the cepstra to spurious low energy perturbations. Mel-cepstrum modulation spectrum [3] is computed from the processed cepstra which results in further noise robustness of the composite feature vector. In experiments with speech signals, it is shown that the proposed technique based features yield a significant increase in speech recognition performance in non-stationary noise conditions when compared directly to the MFCC and RASTA-PLP features.},
}

@inproceedings{Seo:2005,
	Author = {Seo, J. S. and Jin, M. and Lee, S. and Jang, D. and Lee, S. and Yoo, C. D.},
	Booktitle = {Proceedings of the International Conference on Acoustics, Speech, and Signal Processing (ICASSP '05)},
	Issn = {1520-6149},
	Keywords = {audio signal processing; feature extraction; multimedia systems; reliability; spectral analysis; stability; MP3 compression; audio fingerprinting; audio processing; equalization; linear speed change; multimedia fingerprinting; noise addition; normalized spectral subband centroids; random start; relevant feature extraction; time-scale modification},
	Pages = {iii/213--iii/216},
	Title = {Audio fingerprinting based on normalized spectral subband centroids},
	Url = {http://ieeexplore.ieee.org/iel5/9711/30652/01415684.pdf?tp=&isnumber=&arnumber=1415684},
	Volume = {3},
	Year = {2005},
	Abstract = {For multimedia fingerprinting, it is crucial to extract relevant features that allow direct access to the distinguishing characteristics of a multimedia object. Features used for fingerprinting directly relate to the performance of the entire fingerprinting system. The paper proposes a novel audio fingerprinting method based on normalized spectral subband centroids. The spectral subband centroid is selected due to its resilience against equalization, compression, and noise addition. Both reliability and robustness issues in the fingerprinting system are addressed. Experimental results show that the proposed method is not only reliable, but also robust against various audio processing steps, including MP3 compression, equalization, random start, time-scale modification, and linear speed change.},
}

@article{Chen:2004,
	Author = {Chen, Jingdong and Huang, Yiteng and Li, Qi and Paliwal, K. K.},
	Doi = {10.1109/LSP.2003.821689},
	Issn = {1070-9908},
	Journal = {IEEE Signal Processing Letters},
	Keywords = {cepstral analysis; channel bank filters; prediction theory; speech recognition; additive noise; cepstral coefficient; cepstrum; clean speech; dynamic centroid feature vector; dynamic spectral subband centroid; filter-bank; linear prediction analysis; mel-frequency cepstral coefficient; noisy environment; noisy speech recognition; robust speech recognition; spectral subband centroid; transitional spectral information; MFCC},
	Number = {2},
	Pages = {258--261},
	Title = {Recognition of noisy speech using dynamic spectral subband centroids},
	Ty = {JOUR},
	Url = {http://ieeexplore.ieee.org/iel5/97/28210/01261994.pdf?tp=&isnumber=&arnumber=1261994},
	Volume = {11},
	Year = {2004},
	Abstract = {Despite their widespread popularity as front-end parameters for speech recognition, the cepstral coefficients derived from either linear prediction analysis or a filter-bank are found to be sensitive to additive noise. In this letter, we discuss the use of spectral subband centroids for robust speech recognition. We show that centroids, if properly selected, can achieve recognition performance comparable to that of the mel-frequency cepstral coefficients (MFCCs) in clean speech, while delivering better performance than MFCC in noisy environments. A procedure is proposed to construct the dynamic centroid feature vector that essentially embodies the transitional spectral information. We discuss some properties of the proposed dynamic features.},
}

@inproceedings{Gajic:2001,
	Author = {Gajic, B. and Paliwal, K. K.},
	Title = {Robust feature extraction using subband spectral centroid histograms},
	Booktitle = {Proceedings of the International Conference on Acoustics, Speech, and Signal Processing (ICASSP '01)},
	Volume = {1},
	Pages = {85--88},
	Year = {2001},
	Doi = {10.1109/ICASSP.2001.940773},
	Url = {http://ieeexplore.ieee.org/iel5/7486/20365/00940773.pdf?tp=&isnumber=&arnumber=940773},
	Keywords = {cepstral analysis; feature extraction; speech recognition; white noise; additive white noise; amplitude information; automatic speech recognition task; cepstral coefficients; frequency information; noise-free conditions; robust feature extraction; short-term power spectrum; speech power spectrum; subband spectral centroid histograms; MFCC},
	Abstract = {In this paper we propose a new framework for utilizing frequency information from the short-term power spectrum of speech. Feature extraction is based on the cepstral coefficients derived from the histograms of subband spectral centroids (SSC). Two new feature extraction algorithms are proposed, one based on frequency information alone, and the other which efficiently combines the frequency and amplitude information from the speech power spectrum. Experimental study on an automatic speech recognition task shows that the proposed methods outperform the conventional speech front-ends in the presence of additive white noise, while they perform comparably in the noise-free conditions},
}

@inproceedings{Schubert:2004,
	Address = {Northwestern University, Illinois},
	Author = {Schubert, E. and Wolfe, J. and Tarnopolsky, A.},
	Booktitle = {Proceedings of the 8th International Conference on Music Perception and Cognition (ICMPC 04)},
	Keywords = {timbre},
	Pages = {654--657},
	Title = {Spectral centroid and timbre in complex, multiple instrumental textures},
	Url = {http://icmpc8.umn.edu/proceedings/ICMPC8/PDF/AUTHOR/MP040215.PDF},
	Year = {2004},
	Abstract = {This paper investigates the dependence of perceived timbral brightness on pitch and spectral centroid for single notes and pairs of simultaneous notes. In both cases, brightness is better correlated with the spectral centroid fc than with the ratio of fc to the pitches of the notes.},
}

@article{johnston1988tca,
	author = {Johnston, J. D.},
	title = {Transform coding of audio signals using perceptual noise criteria},
	journal = {IEEE Journal on Selected Areas in Communications},
	volume = {6},
	number = {2},
	pages = {314--323},
	month = feb,
	year = {1988},
	doi = {10.1109/49.608},
	url = {http://ieeexplore.ieee.org/iel1/49/39/00000608.pdf?tp=&isnumber=39&arnumber=608},
	abstract = {A 4-b/sample transform coder is designed using a psychoacoustically derived noise-masking threshold that is based on the short-term spectrum of the signal. The coder has been tested in a formal subjective test involving a wide selection of monophonic audio inputs. The signals used in the test were of 15-kHz bandwidth, sampled at 32 kHz. The bit rate of the resulting coder was 128 kb/s. The subjective test shows that the coded signal could not be distinguished from the original at that bit rate. Subsequent informal work suggests that a bit rate of 96 kb/s may maintain transparency for the set of inputs used in the test.},
}

@article{Zahorian:1993,
	author = {Zahorian, S. A. and Jagharghi, A. J.},
	title = {Spectral-shape features versus formants as acoustic correlates for vowels},
	journal = {The Journal of the Acoustical Society of America},
	volume = {94},
	number = {4},
	pages = {1966--1982},
	year = {1993},
	publisher = {ASA},
	doi = {10.1121/1.407520},
	abstract = {The first three formants, i.e., the first three spectral prominences of the short-time magnitude spectra, have been the most commonly used acoustic cues for vowels ever since the work of Peterson and Barney [J. Acoust. Soc. Am. 24, 175--184 (1952)]. However, spectral shape features, which encode the global smoothed spectrum, provide a more complete spectral description, and therefore might be even better acoustic correlates for vowels. In this study automatic vowel classification experiments were used to compare formants and spectral-shape features for monophthongal vowels spoken in the context of isolated CVC words, under a variety of conditions. The roles of static and time-varying information for vowel discrimination were also compared. Spectral shape was encoded using the coefficients in a cosine expansion of the nonlinearly scaled magnitude spectrum. Under almost all conditions investigated, in the absence of fundamental frequency (F0) information, automatic vowel classification based on spectral-shape features was superior to that based on formants. If F0 was used as an additional feature, vowel classification based on spectral shape features was still superior to that based on formants, but the differences between the two feature sets were reduced. It was also found that the error pattern of perceptual confusions was more closely correlated with errors in automatic classification obtained from spectral-shape features than with classification errors from formants. Therefore it is concluded that spectral-shape features are a more complete set of acoustic correlates for vowel identity than are formants. In comparing static and time-varying features, static features were the most important for vowel discrimination, but feature trajectories were valuable secondary sources of information.},
}

@article{Wessel:1979,
	author = {Wessel, D.},
	title = {Timbre space as a musical control structure},
	journal = {Computer Music Journal},
	volume = {3},
	number = {2},
	pages = {45--52},
	year = {1979},
	url = {http://mediatheque.ircam.fr/articles/textes/Wessel78a/},
	abstract = {Research on musical timbre typically seeks representations of the perceptual structure inherent in a set of sounds that have implications for expressive control over the sounds in composition and performance. With digital analysis-based sound synthesis and with experiments on tone quality perception, we can obtain representations of sounds that suggest ways to provide low-dimensional control over their perceptually important properties.

In this paper, we will describe a system for taking subjective measures of perceptual contrast between sound objects and using this data as input to some computer programs. The computer programs use multidimensional scaling algorithms to generate geometric representations from the input data. In the timbral spaces that result from the scaling programs, the various tones can be represented as points and a good statistical relationship can be sought between the distances in the space and the contrast judgments between the corresponding tones. The spatial representation is given a psychoacoustical interpretation by relating its dimensions to the acoustical properties of the tones. Controls are then applied directly to these properties in synthesis. The control schemes to be described are for additive synthesis and allow for the manipulation of the evolving spectral energy distribution and various temporal features of the tones. Tests of the control schemes have been carried out in musical contexts. Particular emphasis will be given here to the construction of melodic lines in which the timbre is manipulated on a note-to-note basis. Implications for the design of human control interfaces and of software for real-time digital sound synthesizers will be discussed.},
}

@article{depoli1997smt,
	author = {De Poli, G. and Prandoni, P.},
	title = {Sonological models for timbre characterization},
	journal = {Journal of New Music Research},
	volume = {26},
	number = {2},
	pages = {170--197},
	year = {1997},
	url = {http://lcavwww.epfl.ch/~prandoni/documents/timbre2.pdf},
	abstract = {In the research on timbre, two important variables have to be assigned from the onset: the instruments used to analyze and to model the physical sound, and the techniques employed to provide an efficient and manageable representation of the data. The experimental methodology which results from these choices defines a sonological model; several different psychoacoustical and analytical tools have been employed to this aim in the past. In this paper we will present a series of experiments...},
}

@inproceedings{Loureiro:2004,
	author = {Loureiro, M. A. and de Paula, H. B. and Yehia, H. C.},
	title = {Timbre Classification of a Single Musical Instrument},
	booktitle = {Proceedings of the 5th International Conference on Music Information Retrieval (ISMIR 2004)},
	pages = {546--549},
	year = {2004},
	url = {http://ismir2004.ismir.net/proceedings/p099-page-546-paper199.pdf},
	abstract = {In order to map the spectral characteristics of the large variety of sounds a musical instrument may produce, different notes were performed and sampled in several intensity levels across the whole extension of a clarinet. Amplitude and frequency time-varying curves of partials were measured by Discrete Fourier Transform. A limited set of orthogonal spectral bases was derived by Principal Component Analysis techniques. These bases defined spectral sub-spaces capable of representing all tested sounds and of grouping them according to the distance metrics of the representation.

A clustering algorithm was used to infer timbre classes. Preliminary tests with resynthesized sounds with normalized pitch showed a strong relation between the perceived timbre and the cluster label to which the notes were assigned. Self-Organizing Maps lead to results similar to those obtained by PCA representation and K-means clustering algorithm.},
}

@inproceedings{Terasawa:2005,
	author = {Terasawa, H. and Slaney, M. and Berger, J.},
	title = {Perceptual distance in timbre space},
	booktitle = {Proceedings of the International Conference on Auditory Display (ICAD05)},
	year = {2005},
	url = {http://cobweb.ecn.purdue.edu/~malcolm/ibm/pubs/Terasawa2005(PerceptualTimbreSpace).pdf},
	abstract = {This paper describes a perceptual space for timbre, defines an objective metric that takes into account perceptual orthogonality and measures the quality of timbre interpolation. We discuss two timbre representations and measure perceptual judgment. We determined that a timbre space based on mel-frequency cepstral coefficients (MFCC) is a good model for perceptual timbre space.},
}

@misc{Stowell:2006,
	author = {Stowell, D.},
	title = {{Genetic} {Algorithms} and Live Evolution},
	howpublished = {SuperCollider symposium, Birmingham},
	month = jul,
	year = {2006},
	url = {http://mcld.co.uk/supercollider/DanStowell-GA-SCsymposium_2006.rtf},
}

@mastersthesis{Shirashi:2006,
	author = {Shiraishi, S.},
	title = {A Real-Time Timbre Tracking Model Based on Similarity},
	school = {Institute of Sonology, The Hague},
	month = jun,
	year = {2006},
	url = {http://www.koncon.nl/public_site/220/Sononieuw/NL/thesis-pdf/SatoshiShiraishiThesis.pdf},
	keywords = {LFCCs, timbre},
	abstract = {With the introduction of the computer as a musical tool, the role of timbre in composition has been largely emphasized. Until today, many sound synthesis models have been introduced, each one offering a different representation for timbre. Nevertheless, current definition of timbre reveals our relatively poor understanding about what timbre is. When we try to extract timbral properties from an acoustic sound, this becomes crucial.

The real-time timbre tracking model presented in this thesis tackles this difficult issue. By following the essence of Timbre Space technique extensively used in the non real-time timbre recognition field, the model tracks timbre of a monophonic acoustic instrument. Unlike conventional approaches, in this model, timbre is described by its similarity/dissimilarity.

The model is fully implemented in the Max/MSP programming environment, and can be applied for digital instrument designs and in interactive computer music, providing another possibility of using acoustic instrumentalist's musical expressivity in computer music.},
}

@book{Soto-Morettini:2006,
	author = {Soto-Morettini, D.},
	title = {Popular Singing: A Practical Guide To: Pop, Jazz, Blues, Rock, Country and Gospel},
	publisher = {A {\&} C Black},
	year = {2006},
	isbn = {978-0713672664},
	keywords = {Donna},
	abstract = {This unique book is a practical guide to exploring the singing voice and will help to enhance vocal confidence in a range of styles including Pop, Jazz, Blues, Rock, Country and Gospel. Both singers and voice teachers will benefit from the author's clear analysis of these styles and advice on how to improve performance. ``Popular Singing'' provides effective alternatives to traditional voice training methods and demonstrates how these methods can be used to create a flexible and unique sound. A free CD of voice demonstrations is also included.},
}

@article{Hunt:2002,
	author = {Hunt, A. and Wanderley, M. M.},
	title = {Mapping performer parameters to synthesis engines},
	journal = {Organised Sound},
	volume = {7},
	number = {2},
	pages = {97--108},
	year = {2002},
	doi = {10.1017/S1355771802002030},
	url = {http://www.journals.cambridge.org/action/displayAbstract?fromPage=online&aid=138211},
	abstract = {This paper considers the issues involved in the design of electronic and computer interfaces, specifically mapping -- the designed link between an instrument's playing interface and its sound source. It defines the problem area, reviews the literature, and gives examples of specific system mappings. A general model is presented, with the aim of providing a framework for future discussions on what makes an effective mapping. Several guidelines for mapping strategies are given, based on existing work.},
}

@inproceedings{puckette2004ldp,
	author = {Puckette, M.},
	title = {Low-dimensional parameter mapping using spectral envelopes},
	booktitle = {Proceedings of the International Computer Music Conference (ICMC'04)},
	pages = {406--408},
	year = {2004},
	url = {http://lena.ucsd.edu/~msp/Publications/icmc04.pdf},
	abstract = {We explore the technique of controlling synthesis using an instrumental or other sound source. The range of spectra available from the sound source, and also that available from the synthesis technique, are estimated and the first is mapped to the second. Suitable synthesis parameters for the synthesis algorithms are found by searching a database of known output spectra. A simple experiment illustrates the technique.},
}

@article{Krom:1995,
	author = {de Krom, G.},
	title = {Some Spectral Correlates of Pathological Breathy and Rough Voice Quality for Different Types of Vowel Fragments},
	journal = {Journal of Speech, Language and Hearing Research},
	volume = {38},
	number = {4},
	pages = {794--811},
	month = aug,
	year = {1995},
	publisher = {ASHA},
	keywords = {breathiness, roughness, spectral parameters, voice quality},
	abstract = {This study deals with the relation between listeners' ratings of pathological breathiness and roughness and certain characteristics of the voice spectrum. Two general research questions were addressed: First, which spectral parameters may serve as useful predictors of breathiness and roughness? Second, does the type of speech fragment used for analysis have an effect on the obtained regression model? Listener ratings of breathiness and roughness were obtained for three types of vowel fragments: a vowel onset segment, a mid-vowel (post-onset) segment, and a vowel segment covering the onset and the acoustically more stable post-onset parts. Results indicated that the harmonics-to-noise ratio was the best single predictor of both rated breathiness and roughness, explaining up to 54\% of the true rating variance. By combining different predictors, between 75\% and 80\% of the breathiness variance could be explained for all three types of fragments. For roughness, a strong effect of fragment type was observed, with most variance explained in vowel onset fragments (71\%), and least in post-onset fragments (52\%). The effect of fragment type was also observed when regression analyses were performed with six predictors based on a factor analysis of the acoustic data.},
}

@article{blomgren1998aap,
	author = {Blomgren, M. and Chen, Y. and Ng, M. L. and Gilbert, H. R.},
	title = {Acoustic, aerodynamic, physiologic, and perceptual properties of modal and vocal fry registers},
	journal = {The Journal of the Acoustical Society of America},
	volume = {103},
	number = {5},
	pages = {2649--2658},
	year = {1998},
	publisher = {ASA},
	doi = {10.1121/1.422785},
	url = {http://www.health.utah.edu/cmdis/Faculty/Blomgren/Acoustics%20of%20modal%20and%20vocal%20fry%20registers.pdf},
	abstract = {The purpose of the study was to examine the acoustic, aerodynamic, physiologic, and perceptual characteristics of modal and vocal fry production. Twenty normal speakers (10 males, 10 females) participated in the study. Speech material included four sustained vowels (/i/,  //,  /{\ae}/,  /u/), and syllable strings of /pi/ repetitions produced in both modal and vocal fry registers. Acoustic data (fundamental frequency, jitter, shimmer, and signal-to-noise ratio), aerodynamic data (airflow and air pressure), and electroglottographic (EGG) data were obtained simultaneously. Results demonstrated considerable differences across voice parameters for the modal and vocal fry registers. Fundamental frequency was significantly lower in vocal fry than in modal register for both males and females, however, significant gender differences existed only in modal register. For both males and females, measurements of jitter and shimmer were significantly higher and signal to noise ratio was significantly lower in vocal fry. In addition, airflow rate in modal register was almost three times as high as the airflow rate in vocal fry register during sustained vowel production. During syllable string production, subglottal air pressure values in modal register were approximately 1.5 times higher than that in the vocal fry register. In general, these data emphasize that the aeromechanical mechanisms of vocal fold vibratory behavior are substantially different between modal and vocal fry registers. A model of vocal fry phonation is presented to account for the present results.},
}

@article{Svec:1996,
	author = {{\v{S}}vec, J. G. and Schutte, H. K. and Miller, D. G.},
	title = {A subharmonic vibratory pattern in normal vocal folds},
	journal = {Journal of Speech and Hearing Research},
	volume = {39},
	number = {1},
	pages = {135--143},
	month = feb,
	year = {1996},
	keywords = {vocal folds, stroboscopy, vibrational modes, bifurcation, entrainment},
	abstract = {This study observes in detail an F0/2 (sounding an octave below an original tone) subharmonic vibratory pattern produced in a normal larynx. Simultaneous electroglottographic and photoglottographic measurements reveal two different open phases within a subharmonic cycle---the first shorter with a simple shape, the second longer with a shape containing a ``ripple.'' Such parameters as the large open quotient (ca. 0.8) and the high airflow values (ca. 1000 cm3/s) distinguish this phonation from the vocal fry (pulse) register. Using an electronic divider to track the subharmonic frequency, a method has been developed to observe the subharmonic vibration of the vocal folds stroboscopically. The stroboscopic visualization reveals an unusual mucosal movement during the ``ripple,'' characterized by an opening movement of the upper margins, which interrupts the closing movement of the vocal folds. An explanation is offered that this vibratory pattern arises as a consequence of detuning of the usually identical frequencies of the dominant modes of the vocal folds, with 3:2 entrainment replacing the normal 1:1 pattern.},
}

@article{Kreiman:1993,
	author = {Kreiman, J. and Gerratt, B. R. and Precoda, K. and Berke, G. S.},
	title = {Perception of supraperiodic voices (A)},
	journal = {The Journal of the Acoustical Society of America},
	volume = {93},
	number = {4},
	pages = {2337},
	month = apr,
	year = {1993},
	doi = {10.1121/1.406275},
	abstract = {The study of voice quality is built largely upon the assumption of vocal (quasi-)periodicity. Supraperiodic phonation types challenge traditional concepts of vocal periodicity, and thus are of theoretic and practical importance, especially when measuring or describing pathologic vocal function. Unfortunately, the literature describing such phonation is confusing. It has been argued [Gerratt et al., J. Acoust. Soc. Am. Suppl. 1 83, S66 (1988)] that the traditional term ``diplophonia'' has been applied to two distinct phonation types. In one (true diplophonia), the two vocal folds vibrate at different rates, producing a waveform resembling a high-frequency wave modulated by a lower frequency envelope. In the second phonation type (``bicyclicity''), a pattern of two cycles repeats; cycles within the pattern differ in period and/or amplitude, giving a big--small--big--small appearance to the waveform. To investigate the perceptual characteristics of these phonation types, sets of male and female voices were constructed that included equal numbers of diplophonic, bicyclic, and noisy voices. Expert listeners judged the dissimilarity of pairs of these voices. Multidimensional scaling analyses confirmed that bicyclicity and diplophonia are easily distinguished from each other and from rough or breathy voices. Psychoacoustic studies examining factors underlying the nature of the bicyclic and diplophonic percepts will also be described. Implications of findings for periodicity-dependent models of laryngeal function will be discussed.},
}

@inproceedings{Essid:2005,
	author = {Essid, S. and Leveau, P. and Richard, G. and Daudet, L. and David, B.},
	title = {On the usefulness of differentiated transient/steady-state processing in machine recognition of musical instruments},
	booktitle = {Proceedings of the {AES} 118th Convention},
	month = may,
	year = {2005},
	url = {http://www.lam.jussieu.fr/src/Membres/Daudet/Publis/Essid_AES118.pdf},
	abstract = {This paper addresses the usefulness of the segmentation of musical sounds into transient/non-transient parts for the task of machine recognition of musical instruments. We put into light the discriminative power of the attack-transient segments on the basis of objective criteria, consistent with the well-known psychoacoustics findings. The sound database used is composed of real-world mono-instrument phrases. Moreover, we show that, paradoxically, it is not always optimal to consider such a segmentation of the audio signal in a machine recognition system for a given decision window. Our evaluation exploits efficient automatic segmentation techniques, a wide variety of signal processing features as well as feature selection algorithms and support vector machine classification.},
}

@inproceedings{iseli1icf,
	author = {Iseli, M. and Alwan, A.},
	title = {An Improved Correction Formula for the Estimation of Harmonic Magnitudes and Its Application to Open Quotient Estimation},
	booktitle = {Proceedings of the IEEE International Conference on Acoustics, Speech, and Signal Processing},
	month = may,
	year = {2004},
	url = {http://www.ee.ucla.edu/~spapl/paper/iseli04.pdf},
	abstract = {Many voice quality parameters, such as the open quotient (OQ), depend on an accurate estimate of the source spectrum. It is known that OQ, for example, is correlated with the magnitude difference of the first two harmonics ($H_1 - H_2$) of the speech source spectrum. In order to compare OQ estimates across different vocal tract configurations, magnitude correction, achieved by removing the influence of vocal tract resonances, has to be used. The improved correction described in this paper is inspired by a correction formula in [1]. The new correction formula accounts for the bandwidths of all vocal tract resonances, and most importantly, is not limited to the analysis of non-high vowels as is the case in [1]. $H_1 - H_2$ estimates, using the proposed technique with synthesized vowels generated with the LF and the KLGLOTT88 models, are very accurate.},
}

@phdthesis{Hanson:1995,
	author = {Hanson, H. M.},
	title = {Glottal characteristics of female speakers},
	school = {Division of Applied Sciences, Harvard University},
	year = {1995},
	url = {http://hdl.handle.net/1721.1/22393},
}

@article{schulman1989adl,
	author = {Schulman, R.},
	title = {Articulatory dynamics of loud and normal speech},
	journal = {The Journal of the Acoustical Society of America},
	volume = {85},
	number = {1},
	pages = {295--312},
	year = {1989},
	publisher = {ASA},
	abstract = {A comparison was made between normal and loud productions of bilabial stops and stressed vowels. Simultaneous recordings of lip and jaw movement and the accompanying audio signal were made for four native speakers of Swedish. The stimuli consisted of 12 Swedish vowels appearing in an /ibb/ frame and were produced with both normal and increased vocal effort. The displacement, velocity, and relative timing associated with the individual articulators as well as their coarticulatory interactions were studied together with changes in acoustic segmental duration. It is shown that the production of loud as compared with normal speech is characterized by amplification of normal movement patterns that are predictable for the above articulatory parameters. In addition, it was observed that the acoustic durations of bilabial stops were shortened, whereas stressed vowels were lengthened during loud speech production. Two interpretations of the data are offered, viewing loud articulatory behavior as a response to production demands and perceptual constraints, respectively.},
}

@inproceedings{Collins:2004,
	author = {Collins, N.},
	title = {On Onsets On-the-Fly: Real-time Event Segmentation and Categorisation as a Compositional Effect},
	booktitle = {Proceedings of Sound and Music Computing},
	pages = {20--22},
	month = oct,
	year = {2004},
	url = {http://www.cus.cam.ac.uk/%7Enc272/papers/pdfs/ononsetsonthefly.pdf},
	keywords = {Onset Detection, Audio Capture, Real-time Segmentation, Categorisation, BBCut},
	abstract = {Compositional applications for real-time event segmentation are discussed. A causal real-time onset detector which makes onset data available as fast as possible is introduced, based on work by Klapuri, Hainsworth and Jensen and Andersen. This analysis frontend informs algorithmic cutting procedures which respect the events of the incoming audio stream. A further refinement stores events to particular buffers based on a coarse categorisation between snare, kick or hihat classes. Algorithmic composers running playback of these buffers thereby respond to changing timbral events of a live feed from an instrumentalist or other audio source. The use of an onset detection threshold to create abstracted rhythms based on some existing source is further examined.},
}

@mastersthesis{blankenship:1997,
	author = {Blankenship, B.},
	title = {The time course of breathiness and laryngealization in vowels},
	school = {Department of Linguistics, UCLA},
	year = {1997},
	url = {http://www.linguistics.ucla.edu/faciliti/research/blankenship.pdf},
	abstract = {In a language where breathiness or laryngealization is a contrastive property
of vowels, such non-modal phonation lasts longer and may be of greater magnitude
than in a language where it is an accident of consonant context.  In Tagalog,
breathy phonation occurs incidentally on vowels after /h/, and laryngealized
phonation occurs after glottal stops.  Mazatec, on the other hand, employs breathy
and laryngealized vowels as separate phonemes that contrast with modal vowels.
Several acoustic measures show that the difference between nonmodal and modal
vowels is stronger and lasts longer in Mazatec than in Tagalog.  Contrary to
expectations, cross-speaker variation is not greater in Tagalog.

The main experiment examined words from 6 male and 6 female speakers of
each language.  A second experiment used 4 male and 4 female speakers of Chong,
1 male speaker of Mpi, and 1 male and 10 female speakers of Navajo.  Three
breathy, 3 laryngealized, and 3 modal vowels from each speaker were analyzed.  To
determine the time course of phonation effects, measurements were made at 25 ms
intervals through each vowel.  The measurements were H1-H2 (hypothesized to
reflect the open quotient of the glottal vibration),  H1-F2 (an approximation of
spectral slope, hypothesized to reflect the abruptness of vocal fold closure), and
cepstral peak prominence (a measure of periodicity).},
}

@article{holmberg1995caa,
	author = {Holmberg, E. B. and Hillman, R. E. and Perkell, J. S. and Guiod, P. C. and Goldman, S. L.},
	title = {Comparisons Among Aerodynamic, Electroglottographic, and Acoustic Spectral Measures of Female Voice},
	journal = {Journal of Speech, Language and Hearing Research},
	volume = {38},
	number = {6},
	pages = {1212--1223},
	year = {1995},
	publisher = {ASHA},
	keywords = {female voice, glottal airflow waveform, glottal aperture, acoustic spectral slope, electroglottography},
	abstract = {This study examines measures of the glottal airflow wave form, the electroglottographic signal (EGG), amplitude differences between peaks in the acoustic spectrum, and observations of the spectral energy content of the third formant (F3), in terms of how they relate to one another. Twenty females with normal voices served as subjects. Both group and individual data were studied. Measurements were made for the vowel in two speech tasks: strings of the syllable /p{\ae}/ and sustained phonation of /{\ae}/, which were produced at two levels of vocal effort: comfortable and loud voice. The main results were:

1. Significant differences in parameter values between /p{\ae}/ and /{\ae}/ were related to significant differences in the sound pressure level (SPL).

2. An ``adduction quotient,'' measured from the glottal waveform at a 30\% criterion, was sensitive enough to differentiate between waveforms reflecting abrupt versus gradual vocal fold closing movements.

3. DC flow showed weak or nonsignificant relationships with acoustic measures.

4. The spectral content in the third formant (F3) in comfortable loudness typically consisted of a mix of noise and harmonic energy. In loud voice, the F3 spectral content typically consisted of harmonic energy.

5. Significant differences were found in all measures between tokens with F3 harmonic energy and tokens with F3 noise, independent of loudness condition.

6. Strong relationships between flow- and EGG-adduction quotients suggested that these signals can be used to complement each other.

7. The amplitude difference between spectral peaks of the first and third formant (F1--F3) was found to add information about abruptness of airflow decrease (flow declination) that may be lost in the glottal waveform signal due to low-pass filtering.

The results are discussed in terms of how an integrated use of these measures can contribute to a better understanding of the normal vocal mechanism and help to improve methods for evaluating vocal function.},
}

@techreport{Nordenberg:2003,
	author = {Nordenberg, M. and Sundberg, J.},
	title = {Effect on {LTAS} of vocal loudness variation},
	institution = {KTH Voice Research Centre, Department of Speech Music and Hearing, KTH},
	number = {45},
	year = {2003},
	url = {http://www.speech.kth.se/prod/publications/files/qpsr/2003/2003_45_1_093-100.pdf},
	abstract = {Long-term-average spectrum (LTAS) is an efficient method for voice analysis,
revealing both voice source and formant characteristics. However, the LTAS
contour is non-uniformly affected by vocal loudness. This variation was analysed
in 15 male and 16 female untrained voices reading a text 7 times at different
degrees of vocal loudness, 62 $\leq$ mean Leq $\leq$ 91 dB @0.3m. In all frequency bands
up to 4 kHz spectrum level was strongly and linearly correlated with overall
equivalent sound level (Leq). The gain factor, i.e., the rate of level increase, varied
with frequency, from about 0.5 at low frequencies to about 1.5 between 1.5 and 3
kHz. Using the gain factors for a voice, LTAS contours could be predicted at any
Leq within the measured range, with an average accuracy of 2 to 3 dB below 4
kHz. Mean LTAS calculated for an Leq of 70 dB for each subject showed
considerable individual variation for both males and females. On the other hand,
the results also indicate that meaningful comparisons of LTAS, recorded e.g.,
before and after voice therapy, can be made, provided that the documentation
includes a minimum of three recordings at different loudness levels from one
recording session.},
}

@article{alku2002naq,
	author = {Alku, P. and B{\"a}ckstr{\"o}m, T. and Vilkman, E.},
	title = {Normalized amplitude quotient for parametrization of the glottal flow},
	journal = {The Journal of the Acoustical Society of America},
	volume = {112},
	number = {2},
	pages = {701--710},
	year = {2002},
	publisher = {ASA},
	abstract = {Normalized amplitude quotient (NAQ) is presented as a method to parametrize the glottal closing phase using two amplitude-domain measurements from waveforms estimated by inverse filtering. In this technique, the ratio between the amplitude of the ac flow and the negative peak amplitude of the flow derivative is first computed using the concept of equivalent rectangular pulse, a hypothetical signal located at the instant of the main excitation of the vocal tract. This ratio is then normalized with respect to the length of the fundamental period. Comparison between NAQ and its counterpart among the conventional time-domain parameters, the closing quotient, shows that the proposed parameter is more robust against distortion such as measurement noise that make the extraction of conventional time-based parameters of the glottal flow problematic. Experiments with breathy, normal, and pressed vowels indicate that NAQ is also able to separate the type of phonation effectively.},
}

@article{sundberg2006evl,
	author = {Sundberg, J. and Nordenberg, M.},
	title = {Effects of vocal loudness variation on spectrum balance as reflected by the alpha measure of long-term-average spectra of speech},
	journal = {The Journal of the Acoustical Society of America},
	volume = {120},
	number = {1},
	pages = {453--457},
	year = {2006},
	publisher = {ASA},
	abstract = {The overall slope of long-term-average spectrum (LTAS) decreases if vocal loudness increases. Therefore, changes of vocal loudness also affects the alpha measure, defined as the ratio of spectrum intensity above and below 1000 Hz. The effect on alpha of loudness variation was analyzed in 15 male and 16 female voices reading a text at different degrees of vocal loudness. The mean range of equivalent sound level (L(eq)) amounted to about 28 dB and the mean range of alpha to 19.0 and 11.7 dB for the female and male subjects. The L(eq) vs. alpha relationship could be approximated with a quadratic function, or by a linear equation, if softest phonation was excluded. Using such equations alpha was computed for all values of L(eq) observed for each subject and compared with observed values. The maximum and the mean absolute errors were 2.4 dB and between 0.1 and 0.6 dB. When softest phonation was disregarded and linear equations were used, the maximum error was less than 2 dB and the mean absolute errors were between 0.2 and 0.7 dB. The strong correlation between L(eq) and alpha indicates that for a voice L(eq) can be used for predicting alpha.},
}

@article{Borch:2004,
	Author = {Borch, D. Zangger and Sundberg, J. and Lindestad, P.-{\AA}. and Thal{\'e}n, M.},
	Doi = {10.1080/14015430410016073},
	Journal = {Logopedics Phoniatrics Vocology},
	Number = {4},
	Pages = {147--153},
	Title = {Vocal fold vibration and voice source aperiodicity in `dist' tones: a study of a timbral ornament in rock singing},
	Url = {http://dx.doi.org/10.1080/14015430410016073},
	Volume = {29},
	Year = {2004},
	Abstract = {The acoustic characteristics of so-called `dist' tones, commonly used in singing rock music, are analyzed in a case study. In an initial experiment a professional rock singer produced examples of `dist' tones. The tones were found to contain aperiodicity, SPL at 0.3 m varied between 90 and 96 dB, and subglottal pressure varied in the range of 20--43 cm H2O, a doubling yielding, on average, an SPL increase of 2.3 dB. In a second experiment, the associated vocal fold vibration patterns were recorded by digital high-speed imaging of the same singer. Inverse filtering of the simultaneously recorded audio signal showed that the aperiodicity was caused by a low frequency modulation of the flow glottogram pulse amplitude. This modulation was produced by an aperiodic or periodic vibration of the supraglottic mucosa. This vibration reduced the pulse amplitude by obstructing the airway for some of the pulses produced by the apparently periodically vibrating vocal folds. The supraglottic mucosa vibration can be assumed to be driven by the high airflow produced by the elevated subglottal pressure.},
}

@inproceedings{Laukkanen:2004,
	Author = {Laukkanen, A. M. and Sundberg, J. and Bj{\"o}rkner, E.},
	Title = {Acoustic study of the ``throaty'' voice quality},
	Booktitle = {Annual Symposium: Care of the Professional Voice},
	Year = {2004},
	Volume = {46},
	Pages = {13--23},
	Abstract = {``Throaty'' voice quality has been regarded by voice pedagogues as undesired and
even harmful. The present study attempts to identify acoustic and physiological
correlates of this quality. One male and one female subject read a text habitually
and with a throaty voice quality. Oral pressure during p-occlusion was measured
as an estimate of subglottic pressure. Long-term-average spectrum (LTAS) analysis
was used to describe the average voice quality. Sixteen syllables, perceptually
evaluated with regard to throaty quality by five experts, were selected for further
analyses. Formant frequencies and voice source characteristics were measured by
means of inverse filtering, and the vocal tract shape of the male subject's throaty
and normal versions of the vowels [a,u,i,ae] was recorded by Magnetic Resonance
imaging. From this material area functions were derived and their resonance
frequencies were determined. To test the relevance of formant frequencies to
perceived throaty quality, experts rated degree of throatiness in synthetic vowel
samples in which the subjects' measured formant frequency values were used.

The main acoustic correlates of throatiness seemed to be an increase of F1, a
decrease of F4 and in front vowels also a decrease of F2, presumably resulting
from a narrowing of the pharynx. In the male subject voice source parameters
suggested a more hyperfunctional voice in throaty samples.},
}

@phdthesis{Laukka:2004,
	Author = {Laukka, P.},
	Month = dec,
	School = {Uppsala University},
	Title = {Vocal expression of emotion},
	Url = {http://www.diva-portal.org/diva/getDocument?urn_nbn_se_uu_diva-4666-1__fulltext.pdf},
	Year = {2004},
}

@article{edmondson2006vta,
	Author = {Edmondson, J. A. and Esling, J. H.},
	Doi = {10.1017/S095267570600087X},
	Journal = {Phonology},
	Month = nov,
	Number = {2},
	Pages = {157--191},
	Publisher = {Cambridge Univ Press},
	Title = {The valves of the throat and their functioning in tone, vocal register and stress: laryngoscopic case studies},
	Volume = {23},
	Year = {2006},
	Abstract = {The standard method of describing phonation for tone, vocal register, stress and other linguistic categories relies on the `continuum hypothesis' that linguistic sounds are produced by means of glottal states determined by the aperture between the arytenoid cartilages, the endpoints of the voiceless--voiced continuum being `open glottis' and `closed glottis'. This paper takes a different view, pointing out that many languages make use of a number of valves, and that these valves are not articulations on a glottal continuum but represent a synergistic and hierarchical system of laryngeal articulations. These valves constitute a principal source of phonological contrast, with an influence on how oral articulatory events are characterised.},
}

@mastersthesis{kehoe:2001,
	Author = {Kehoe, A.},
	School = {Trinity College Dublin},
	Title = {The Analysis and Synthesis of Three Voice Qualities},
	Type = {Senior Sophister course project},
	Url = {http://www.netsoc.tcd.ie/~hcksplat/work/speech-science.pdf},
	Year = {2001},
	Abstract = {In this paper, I present work dealing with the analysis and synthesis of three voice
qualities, as defined in (Laver, 1980). The voice qualities in question are modal
voice, tense voice and whispery voice. This was carried out in the context of
course work for LI1803, the final-year class in speech analysis and synthesis for
CSLL (Computer Science, Linguistics and a Language), and draws on the papers
and research covered in that class.

The first section presents some background and the aims of the project. It
summarises Laver's classification of voice qualities, and describes the qualities under
discussion in terms of his three tension parameters, also presenting something of
their expected acoustic characteristics.

The second explains the analysis techniques used, describing in detail inverse
filtering and model matching, which are used for extracting quantifiable measures
of the voice source. It presents the materials analysed, together with the
synthesiser used, and the mathematical conversions necessary for generating appropriate
(synthesiser) input parameters from the source parameters.

The third shows the results, and averaged data for the three voice qualities are
presented. These are discussed, with reference made to the results of the synthesis
experiments based on the analyses. Further discussion is presented on the variation
of parameters within the synthesiser to emphasise (and minimise) whispery voice
quality.},
}

@article{wayland2003acb,
	Author = {Wayland, R. and Jongman, A.},
	Journal = {Journal of Phonetics},
	Number = {2},
	Pages = {181--201},
	Publisher = {Elsevier Science},
	Title = {Acoustic correlates of breathy and clear vowels: the case of {Khmer}},
	Url = {http://www2.ku.edu/~kuppl/jongman/Wayland%20&%20Jongman%2003.pdf},
	Volume = {31},
	Year = {2003},
	Abstract = {This study investigates acoustic correlates of the putative breathy and clear phonation type contrast in a
dialect of Khmer (Cambodian) spoken in Chanthaburi Province, Thailand. The goal is to determine
whether this Khmer dialect still preserves this historical contrast. Out of seven acoustic parameters
measured, four, namely H1-H2, H1-A1, H1-A3, and vowel RMS amplitude successfully
distinguished between breathy and clear vowels, with H1-H2 measured at the beginning of the vowel
being the most robust cue. However, the use of these cues varied from speaker to speaker. The H1-H2
measurement obtained from male speakers' production suggested that the contrast being realized may be
that of a tense versus lax voice rather than a breathy versus clear voice. It is concluded that the historical
breathy and clear phonation distinction in Khmer is preserved among female speakers, but this distinction
may be disappearing or have become a tense versus lax distinction among male speakers.},
}

@misc{wayland:acb,
	Author = {Wayland, R. P. and Hall, T.},
	Title = {Acoustic Correlates of Breathy and Clear Vowels in {Chong}: a Preliminary Analysis},
	Url = {http://grove.ufl.edu/~linclub/focus/Ratree.pdf},
}

@misc{bombien:vai,
	Title = {Voicing alterations in {I}celandic sonorants -- a photoglottographic and acoustic analysis},
	Author = {Bombien, L.},
	Url = {http://www.ipds.uni-kiel.de/pub_exx/aipuk/aipuk_37/37_6_Bombien_akfra.pdf},
}

@techreport{carlsson1992fft,
	Author = {Carlsson, G. and Sundberg, J.},
	Title = {Formant frequency tuning in singing},
	Institution = {KTH, Department for Speech, Music and Hearing},
	Year = {1991},
	Volume = {32},
	Number = {1},
	Pages = {29--35},
	Url = {http://www.speech.kth.se/prod/publications/files/qpsr/1991/1991_32_1_029-035.pdf},
}

@article{Waaramaa:2006,
	Author = {Waaramaa, Teija and Alku, Paavo and Laukkanen, Anne-Maria},
	Doi = {10.1080/14015430500456739},
	Journal = {Logopedics Phoniatrics Vocology},
	Number = {4},
	Pages = {153--156},
	Title = {The role of {F3} in the vocal expression of emotions},
	Url = {http://dx.doi.org/10.1080/14015430500456739},
	Volume = {31},
	Year = {2006},
	Abstract = {The present study investigates the role of F3 in the perception of valence of emotional expressions by using a vowel [a:] with different F3 values: the original, one with F3 either lowered or raised by 30\% in frequency, and one with F3 removed. The vowel [a:] was extracted from the simulated emotions, inverse filtered and manipulated. The resulting 12 synthesized samples were randomized and presented to 30 listeners who evaluated the valence (positiveness/negativeness) of the expressions. The vowel with raised F3 was perceived more often as positive than the sample with original (p=0.063), lowered (p=0.006) or removed F3 (p=0.066). F3 may affect perception of valence if the signal has sufficient energy in high frequency range.},
}

@article{watkins1997cpt,
	Author = {Watkins, J.},
	Journal = {SOAS Working Papers in Linguistics and Phonetics},
	Pages = {321--339},
	Title = {Can phonation types be reliably measured from sound spectra? {Some} data from {Wa} and {Burmese}},
	Url = {http://eprints.soas.ac.uk/archive/00000048/01/SOAS_WP_1997_-_Phonation_types_and_spectra.pdf},
	Volume = {7},
	Year = {1997},
}

@inproceedings{nordstrom2006ilf,
	Author = {Nordstrom, K. I. and Driessen, P. F. and Rutledge, G. A.},
	Booktitle = {{IEEE} Int. Symposium on Signal Processing and Information Technology (ISSPIT06)},
	Month = aug,
	Title = {Influence of the {LPC} filter upon the perception of breathiness and vocal effort},
	Url = {http://www.ece.uvic.ca/~knordstr/library/NordstromDriessen06b.pdf},
	Year = {2006},
	Abstract = {According to the source-filter paradigm, the perception of breathiness
and vocal effort should be primarily controlled by the glottal
source and be little affected by the formant filter. This experiment
investigates whether the formant filter estimated by linear prediction
(LPC) can influence the perception of breathiness and vocal effort.
The experiment starts with a pair of voice samples. One sample exhibits
high effort and the other sample exhibits breathiness. LPC
estimates a filter and residual for each sample. The influence of the
residual is eliminated by providing both filters with the same artificial
source during resynthesis. The synthesized samples differ only
according to the difference between the two filters. Three pairs of
samples were evaluated by seven people in listening tests. The results
demonstrate that the LPC filters do influence the perception of
breathiness and vocal effort. When a voice changes between breathiness
and vocal effort, the spectral envelope changes. This change is
captured by the LPC filter rather than the residual. A closer look at
the LPC algorithm provides an explanation for this result.},
}

@incollection{Airas:2004,
	Author = {Airas, M. and Alku, P.},
	Booktitle = {Affective Dialogue Systems},
	Doi = {10.1007/b98229},
	Pages = {13--24},
	Publisher = {Springer Berlin / Heidelberg},
	Series = {Lecture Notes in Computer Science},
	Title = {Emotions in Short Vowel Segments: Effects of the Glottal Flow as Reflected by the Normalized Amplitude Quotient},
	Volume = {3068},
	Year = {2004},
	Abstract = {Emotions in continuous speech were analyzed using inverse filtering and a recently developed glottal flow parameter, the normalized amplitude quotient (NAQ). Simulated emotion portrayals were produced by 9 professional stage actors. Segments of the vowel /a:/ were separated from continuous speech. The segments were inverse filtered and parametrized using NAQ. Statistical analyses showed significant differences between most studied emotions. Results also showed clear gender differences. Inverse filtering together with NAQ was shown to be a suitable method for analysis of emotional content in continuous speech.},
}

@article{Kreiman:2005,
	Author = {Kreiman, J. and Gerratt, B. R.},
	Journal = {The Journal of the Acoustical Society of America},
	Number = {4},
	Pages = {2201--2211},
	Title = {Perception of aperiodicity in pathological voice},
	Url = {http://repositories.cdlib.org/postprints/1117},
	Volume = {117},
	Year = {2005},
	Abstract = {Although jitter, shimmer, and noise acoustically characterize all voice signals, their perceptual importance in naturally produced pathological voices has not been established psychoacoustically. To determine the role of these attributes in the perception of vocal quality, listeners were asked to adjust levels of jitter, shimmer, and the noise-to-signal ratio in a speech synthesizer, so that synthetic voices matched naturally produced tokens. Results showed that, although listeners agreed well in their judgments of the noise-to-signal ratio, they did not agree with one another in their chosen settings for jitter and shimmer. Noise-dependent differences in listeners' ability to detect changes in amounts of jitter and shimmer implicate both listener insensitivity and inability to isolate jitter and shimmer as separate dimensions in the overall pattern of aperiodicity in a voice as causes of this poor agreement. These results suggest that jitter and shimmer are not useful as independent indices of perceived vocal quality, apart from their acoustic contributions to the overall pattern of spectrally shaped noise in a voice.},
}

@mastersthesis{Quast:2001,
	Author = {Quast, H.},
	School = {Drittes Physikalisches Institut, Georg August Universit{\"a}t G{\"o}ttingen, and Machine Perception Lab, Institute for Neural Computation, University of California, San Diego},
	Title = {Automatic Recognition of Nonverbal Speech},
	Url = {http://ergo.ucsd.edu/~holcus/papers/Thesis.pdf},
	Year = {2001},
	Abstract = {In order to train a pattern recognition system to recognize affective speech, it must be
provided with examples.  The next part, Chapter 2, describes how a database of speech
recordings of German actors and nonactors is assembled.  These recordings are then evaluated
in the seven categories pleasant -- happy -- confident -- strong -- agitated -- leadership -- angry
by Californian listeners.  The results are normalized and refined to yield a measure of the
affective content for each recording and category.  A second value, a confidence factor, is
derived that describes how high listeners' agreement was in each category, for each recording.
The results are qualitatively compared to scores from German judges who received the
evaluation program and submitted their scores over the internet.

The third chapter deals with the acoustic parameters that represent a speech recording.
These parameters fall into the categories fundamental frequency, intensity, and spectral
composition.  To extract these, a pitch tracker is developed, a new psychoacoustically
motivated speech loudness model is introduced, and other signal processing tools are build
that lead to a total of 18 acoustic parameters representing each recording.

To see if the affective, psycholinguistic value of a speech recording can be represented as
a function of its signal processing parameters, neural networks are used as pattern recognition
engines to map one set of features onto the other one.  This process is elaborated in Chapter 4.
The network is trained by showing it example pairs of acoustic and affective data, and, if it
has learned successfully, the network is able to generalize, i.e. see new examples, and be able
to assess its affective content.  Since listeners' agreement strongly fluctuated in the evaluation
of the speech recording, the neural networks are programmed to consider the quality of a data
point when learning from it by means of the confidence factor computed in the second
chapter.  A new psychoacoustically motivated data representation technique called lombada,
based on the loudness model developed in Chapter 3, is introduced.  With the lombada
technique, the prosodic information can be stored at a fraction of the space occupied by the
original recording.

Chapter 5 shows how the affect recognizer developed here was successfully applied to
build a nonverbal speech dialogue interface as can be used for a pet robot.

Chapter 6 Concludes this work, discusses the findings and gives an outlook to possible
future investigations in this field.},
}

@inproceedings{story2003pmv,
	Author = {Story, B. H.},
	Booktitle = {VOQUAL'03},
	Month = aug,
	Title = {Physical modeling of voice and voice quality},
	Url = {http://www.u.arizona.edu/~bstory/voqual03_story.pdf},
	Year = {2003},
	Abstract = {Physical modeling of the phonatory and vocal tract systems
has served as a useful tool to study many aspects of speech production.
This paper offers a brief review of two specific types
of physically-based models. One for simulating the vibration
of the vocal folds and another for representing the vocal tract
shape in the form of an area function. While much of the technical
detail of these models has been presented elsewhere, the
emphasis here will be on providing examples of how physical
models may be used for studying voice quality.},
}

@inproceedings{kreiman:dam,
	Author = {Kreiman, J. and Vanlancker-Sidtis, D. and Gerratt, B. R.},
	Booktitle = {Proceedings of From Sound To Sense: 50+ Years of Discoveries in Speech Communication},
	Month = jun,
	Organization = {MIT},
	Pages = {115--120},
	Title = {Defining and measuring voice quality},
	Url = {http://www.rle.mit.edu/soundtosense/conference/pdfs/fulltext/Saturday%20Posters/SB-Kreiman-STS.pdf},
	Year = {2004},
	Abstract = {Although voices provide listeners with significant information about speakers, defining and
quantifying voice quality remain elusive goals.  The ANSI standard definition of quality (that
attribute of auditory sensation in terms of which a listener can judge that two sounds similarly
presented and having the same loudness and pitch are dissimilar) is often criticized because it
specifies what quality is not, rather than what it is.  It has also proven difficult to devise
measurement protocols for quality as specified in the ANSI definition.  We argue that the ANSI
definition is in fact appropriate, because it treats quality as the result of perceptual
processes---interactions between listeners and signals in the context of specific perceptual goals.
Application of speech synthesis in method-of-adjustment tasks allows measurement of quality
psychoacoustically as those aspects of the signal that allow a listener to determine that two
sounds of equal pitch and loudness are different, consistent with the ANSI definition, and
provides insight into the salient acoustic attributes contributing to quality.  This technique holds
promise for improving the reliability and validity of measures of voice quality.},
}

@article{Blankenship:2002,
	Author = {Blankenship, B.},
	Journal = {Journal of Phonetics},
	Month = apr,
	Number = {2},
	Pages = {163--191},
	Title = {The timing of nonmodal phonation in vowels},
	Url = {http://www.ingentaconnect.com/content/ap/jp/2002/00000030/00000002/art00155},
	Volume = {30},
	Year = {2002},
	Abstract = {In a language where breathiness or laryngealization is a contrastive property of vowels, such nonmodal phonation lasts longer and is more differentiated from modal phonation than in a language where nonmodal phonation results from the influence of preceding consonants. In Tagalog, breathy phonation occurs incidentally on vowels after /h/, and laryngealized phonation occurs after glottal stops. Mazatec, on the other hand, employs breathy and laryngealized vowels as separate phonemes that contrast with modal vowels. Several acoustic measures show that nonmodal and modal vowels are differentiated more strongly and over a longer duration in Mazatec than in Tagalog.
An experiment examined words from six male and six female speakers of each of those languages, with corroborating modal and breathy vowels from four male and four female speakers of Chong, and modal and laryngealized vowels from one male speaker of Mpi. From each speaker, three vowels of each phonation type were analyzed. To determine the time course of phonation effects, measurements were made at 25 ms intervals through each vowel. The measurements were the amplitude differences between the first and second harmonic and between the first harmonic and the second formant, and cepstral peak prominence (a measure of periodicity).},
}

@article{Gerratt:2001,
	Author = {Gerratt, B. R. and Kreiman, J.},
	Journal = {Journal of Phonetics},
	Keywords = {vocal fry, voice quality, creaky voice, breathiness},
	Month = oct,
	Number = {4},
	Pages = {365--381},
	Title = {Toward a taxonomy of nonmodal phonation},
	Volume = {29},
	Year = {2001},
	Abstract = {The study of nonmodal phonation, like the study of other aspects of voice quality, spans many disciplines. Descriptions of such phonation abound, but variations in scope, purpose, terminology, measurement technique, and level of description make it difficult to compare vocal phenomena across disciplines, or even across studies within a single discipline. We demonstrate how hypotheses about which kinds of nonmodal phonation types are the same and which are different can be tested by studies of listeners' perceptions. Evidence suggests that period-doubled phonation, amplitude modulations, and vocal fry form perceptually distinctive qualities, which also have consistent acoustic and physiological correlates. Evidence is much more ambiguous for qualities like breathiness and creak, which vary continuously from modal phonation. A common theoretical framework for the description of vocal quality may eventually eliminate many impediments to unified description.},
}

@article{Gordon:2001,
	Author = {Gordon, M. and Ladefoged, P.},
	Journal = {Journal of Phonetics},
	Month = oct,
	Number = {4},
	Pages = {383--406},
	Title = {Phonation types: a cross-linguistic overview},
	Volume = {29},
	Year = {2001},
	Abstract = {Differences in phonation type signal important linguistic information in many languages, including contrasts between otherwise identical lexical items and boundaries of prosodic constituents. Phonation differences can be classified along a continuum ranging from voiceless, through breathy voiced, to regular, modal voicing, and then on through creaky voice to glottal closure. Cross-linguistic investigation suggests that this phonation continuum can be defined in terms of a recurring set of articulatory, acoustic, and timing properties. Nevertheless, there exist several languages whose phonation contrasts do not neatly fall within the phonation categories defined by other languages.},
}

@article{Scherer:2003,
	Address = {Amsterdam, The Netherlands},
	Author = {Scherer, Klaus R.},
	Doi = {10.1016/S0167-6393(02)00084-5},
	Issn = {0167-6393},
	Journal = {Speech Communication},
	Number = {1--2},
	Pages = {227--256},
	Publisher = {Elsevier Science Publishers B. V.},
	Title = {Vocal communication of emotion: a review of research paradigms},
	Volume = {40},
	Year = {2003},
}

@incollection{Harris:1995,
	Address = {Harlow, Essex},
	Author = {Harris, J. and Lindsey, G.},
	Booktitle = {Frontiers of phonology: atoms, structures, derivations},
	Editor = {Durand, Jacques and Katamba, Francis},
	Pages = {34--79},
	Publisher = {Longman},
	Title = {The elements of phonological representation},
	Url = {http://www.phon.ucl.ac.uk/home/johnh/papers/Durakata01.pdf},
	Year = {1995},
}

@book{Laver:1980,
	Author = {Laver, J.},
	Title = {The Phonetic Description of Voice Quality},
	Series = {Cambridge Studies in Linguistics},
	Publisher = {Cambridge University Press},
	Year = {1980},
	Url = {http://www.ling.mq.edu.au/ling/units/sph302/papers/laver_1980_phonation.pdf},
}

@inproceedings{Traube:2005,
	Author = {Traube, C. and D'Alessandro, N.},
	Booktitle = {Proceedings of the 8th International Conference on Digital Audio Effects (DAFx'05)},
	Keywords = {phonetics, guitar, imitation},
	Pages = {104--109},
	Title = {Vocal Synthesis and Graphical Representation of the Phonetic Gestures Underlying Guitar Timbre Description},
	Url = {http://www.dalessandro.be/research/papers/dafx05.pdf},
	Year = {2005},
	Abstract = {The guitar is an instrument that gives the player great control over
timbre. Different plucking techniques involve varying the finger
position along the string, the inclination between the finger and the
string, the inclination between the hand and the string and the degree
of relaxation of the plucking finger. Guitarists perceive subtle
variations of these parameters and they have developed a very rich
vocabulary to describe the brightness, the colour, the shape and
the texture of the sounds they produce on their instrument. Dark,
bright, chocolatey, transparent, muddy, wooly, glassy, buttery, and
metallic are just a few of those adjectives. The aim of this research
is to conceive a computer tool producing the synthesis of the vocal
imitation as well as the graphical representation of phonetic gestures
underlying the description of the timbre of the classical guitar,
as a function of the instrumental gesture parameters (mainly
the plucking angle and distance from the bridge) and based on perceptual
analogies between guitar and speech sounds. Similarly to
the traditional teaching of tabla which uses onomatopeia to designate
the different strokes, vocal imitation of guitar timbres could
provide a common language to guitar performers, complementary
to the mental imagery they commonly use to communicate about
timbre, in a pedagogical context for example.},
}

@inproceedings{DAlessandro:2006,
	Author = {D'Alessandro, C. and D'Alessandro, N. and Doval, B. and Le Beux, S.},
	Title = {Comparing Time-Domain and Spectral-Domain Voice Source Models for Gestural Controlled Voice Instruments},
	Booktitle = {Proceedings of the 5th International Conference on Voice Physiology and Biomechanics (ICVPB'06)},
	Year = {2006},
	Url = {http://www.dalessandro.be/research/papers/icvpb06.pdf},
}

@inproceedings{DAlessandro:2006b,
	Author = {D'Alessandro, N. and Doval, B. and Le Beux, S. and Woodruff, P. and Fabre, Y.},
	Booktitle = {Proceedings of the 2nd eNTERFACE Summer Workshop on Multimodal Interfaces (eNTERFACE'06)},
	Pages = {81--90},
	Title = {{RAMCESS}: Realtime and Accurate Musical Control of Expression in Singing Synthesis},
	Url = {http://www.dalessandro.be/research/papers/enterface06.pdf},
	Year = {2006},
	Abstract = {The main purpose of this project is to develop a full computer-based musical instrument allowing realtime synthesis of expressive singing voice. The expression will result from the continuous action of an interpreter through a gestural control interface. That gestural parameters will influence the characteristics thanks to particular mapping strategies.},
}

@inproceedings{Herre:2003,
	Author = {Herre, J. and Allamanche, E. and Ertel, C.},
	Booktitle = {Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA-2003)},
	Month = oct,
	Pages = {83--86},
	Title = {How Similar Do Songs Sound? {Towards} Modeling Human Perception of Musical Similarity},
	Url = {http://ieeexplore.ieee.org/iel5/9038/28686/01285825.pdf?arnumber=1285825},
	Year = {2003},
	Abstract = {Human listeners have a well-developed feeling to identify
``whether two songs sound similar'' or whether they don't.
Even though this type of judgment usually also involves a
considerable amount of the listener's background
knowledge, it has been demonstrated that an algorithmic
model of this type of similarity can be achieved by merely
evaluating the signal's low-level acoustic features. This
paper describes a system for assessing subjective sound
similarity between pairs of musical items by using a number
of such signal features. The system's performance is assessed
by means of a subjective listening test that is based on a
modification of a test methodology originally Standardized
for subjective sound quality evaluation. A number of
interesting applications for such a technology are described.},
}

@article{Oohashia:2006,
	Author = {Oohashi, T. and Kawai, N. and Nishina, E. and Honda, M. and Yagi, R. and Nakamura, S. and Morimoto, M. and Maekawa, T. and Yonekura, Y. and Shibasaki, H.},
	Doi = {10.1016/j.brainres.2005.12.096},
	Journal = {Brain Research},
	Keywords = {Auditory; Nonauditory; Vibration; Electroencephalogram; Comfortable listening level; Body surface},
	Month = feb,
	Pages = {339--347},
	Title = {The role of biological system other than auditory air-conduction in the emergence of the hypersonic effect},
	Volume = {1073},
	Year = {2006},
	Abstract = {Although human beings cannot perceive elastic vibrations in the frequency range above 20 kHz, nonstationary sounds containing a wealth of inaudible high-frequency components (HFC) above the human audible range activate deep-lying brain structures, including the brainstem and thalamus and evoke various physiological, psychological, and behavioral responses. In the previous reports, we have called these phenomena collectively ``the hypersonic effect.'' It remains unclear, however, if vibratory stimuli above the audible range are transduced and perceived solely via the conventional air-conducting auditory system or if other mechanisms also contribute to mediate transduction and perception. In the present study, we have examined the emergence of the hypersonic effect when inaudible HFC and audible low-frequency components (LFC) were presented selectively to the ears, the entrance of an air-conducting auditory system, or to the body surface including the head which might contain some unknown vibratory sensing mechanisms. We used two independent measurements based on differing principles; one physiological (alpha 2 frequency of spontaneous electroencephalogram [alpha-EEG]) and the other behavioral (the comfortable listening level [CLL]). Only when the listener's entire body surface was exposed to HFC, but not when HFC was presented exclusively to the air-conducting auditory system, did both the alpha-EEG and the CLL significantly increase compared to the presentation of LFC alone, that is to say, there was an evident emergence of the hypersonic effect. The present findings suggest that the conventional air-conducting auditory system alone does not bring about the hypersonic effect. We may need to consider the possible involvement of a biological system distinct from the conventional air-conducting auditory nervous system in sensing and transducing high-frequency elastic vibration above the human audible range.},
}

@article{oohashi2000ihf,
	Author = {Oohashi, T. and Nishina, E. and Honda, M. and Yonekura, Y. and Fuwamoto, Y. and Kawai, N. and Maekawa, T. and Nakamura, S. and Fukuyama, H. and Shibasaki, H.},
	Title = {Inaudible High-Frequency Sounds Affect Brain Activity: Hypersonic Effect},
	Journal = {Journal of Neurophysiology},
	Year = {2000},
	Volume = {83},
	Number = {6},
	Pages = {3548--3558},
	Publisher = {Am Physiological Soc},
	Abstract = {Although it is generally accepted that humans cannot perceive sounds in the frequency range above 20 kHz, the question of whether the existence of such ``inaudible'' high-frequency components may affect the acoustic perception of audible sounds remains unanswered. In this study, we used noninvasive physiological measurements of brain responses to provide evidence that sounds containing high-frequency components (HFCs) above the audible range significantly affect the brain activity of listeners. We used the gamelan music of Bali, which is extremely rich in HFCs with a nonstationary structure, as a natural sound source, dividing it into two components: an audible low-frequency component (LFC) below 22 kHz and an HFC above 22 kHz. Brain electrical activity and regional cerebral blood flow (rCBF) were measured as markers of neuronal activity while subjects were exposed to sounds with various combinations of LFCs and HFCs. None of the subjects recognized the HFC as sound when it was presented alone. Nevertheless, the power spectra of the alpha frequency range of the spontaneous electroencephalogram (alpha-EEG) recorded from the occipital region increased with statistical significance when the subjects were exposed to sound containing both an HFC and an LFC, compared with an otherwise identical sound from which the HFC was removed (i.e., LFC alone). In contrast, compared with the baseline, no enhancement of alpha-EEG was evident when either an HFC or an LFC was presented separately. Positron emission tomography measurements revealed that, when an HFC and an LFC were presented together, the rCBF in the brain stem and the left thalamus increased significantly compared with a sound lacking the HFC above 22 kHz but that was otherwise identical. Simultaneous EEG measurements showed that the power of occipital alpha-EEGs correlated significantly with the rCBF in the left thalamus. 
Psychological evaluation indicated that the subjects felt the sound containing an HFC to be more pleasant than the same sound lacking an HFC. These results suggest the existence of a previously unrecognized response to complex sound containing particular types of high frequencies above the audible range. We term this phenomenon the ``hypersonic effect.''},
}

@article{bilsen2006rpg,
	Author = {Bilsen, F. A.},
	Doi = {10.1121/1.2213570},
	Journal = {The Journal of the Acoustical Society of America},
	Month = aug,
	Number = {2},
	Pages = {594},
	Publisher = {ASA},
	Title = {Repetition Pitch glide from the step pyramid at {Chichen} {Itza}},
	Volume = {120},
	Year = {2006},
	Abstract = {Standing at the foot of the Mayan step pyramid at Chichen Itza in Mexico, one can produce a pitchy ``chirp'' echo by handclapping. As exposed by Declercq et al. [J. Acoust. Soc. Am. 116, 3328--3335 (2004)], an acoustic model based on optical Bragg diffraction at a periodic structure cannot explain satisfactorily the chirp-echo sonogram. Alternatively, considering the echo as a sequence of reflections, and given the dimensions of the pyramid and source-receiver position, the chirp is predicted correctly as a Repetition Pitch glide of which the pitch height is continuously decreasing within 177 ms from 796 to 471 Hz-equivalent.},
}

@article{Assmann:2000,
	Author = {Assmann, Peter F. and Katz, William F.},
	Doi = {10.1121/1.1289363},
	Journal = {The Journal of the Acoustical Society of America},
	Month = oct,
	Number = {4},
	Pages = {1856--1866},
	Title = {Time-varying spectral change in the vowels of children and adults},
	Volume = {108},
	Year = {2000},
	Abstract = {Recent studies have shown that time-varying changes in formant pattern contribute to the phonetic specification of vowels. This variation could be especially important in children's vowels, because children have higher fundamental frequencies (f0's) than adults, and formant-frequency estimation is generally less reliable when f0 is high. To investigate the contribution of time-varying changes in formant pattern to the identification of children's vowels, three experiments were carried out with natural and synthesized versions of 12 American English vowels spoken by children (ages 7, 5, and 3 years) as well as adult males and females. Experiment 1 showed that (i) vowels generated with a cascade formant synthesizer (with hand-tracked formants) were less accurately identified than natural versions; and (ii) vowels synthesized with steady-state formant frequencies were harder to identify than those which preserved the natural variation in formant pattern over time. The decline in intelligibility was similar across talker groups, and there was no evidence that formant movement plays a greater role in children's vowels compared to adults. Experiment 2 replicated these findings using a semi-automatic formant-tracking algorithm. Experiment 3 showed that the effects of formant movement were the same for vowels synthesized with noise excitation (as in whispered speech) and pulsed excitation (as in voiced speech), although, on average, the whispered vowels were less accurately identified than their voiced counterparts. Taken together, the results indicate that the cues provided by changes in the formant frequencies over time contribute materially to the intelligibility of vowels produced by children and adults, but these time-varying formant frequency cues do not interact with properties of the voicing source.},
}

@inproceedings{Kelleher:2005,
	Author = {Kelleher, A. and Fitzgerald, D. and Gainza, M. and Coyle, E. and Lawlor, B.},
	Booktitle = {Proc. {AES} 118th Convention},
	Title = {Onset Detection, Music Transcription and Ornament Detection for the Traditional {Irish} Fiddle},
	Year = {2005},
	Abstract = {By combining techniques used in previous onset detectors, a system that detects note onsets in traditional Irish fiddle tunes has been implemented. The notes detected also include the most common types of ornamentation played by the fiddle. Ornaments are notes of extremely short duration, at most a fifth the length of a regular note. A Short Time Fourier Transform based sub-band technique, which previously gave good results for the Irish tin whistle, was modified to include a threshold approximation more suitable for the fiddle. This system has been tested on a database of real recorded fiddle tunes and good results have been achieved.},
}

@inproceedings{Zolfaghari:1997,
	Author = {Zolfaghari, P. and Robinson, T.},
	Booktitle = {IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
	Doi = {10.1109/ICASSP.1997.596253},
	Month = apr,
	Pages = {1575--1578},
	Title = {A formant vocoder based on mixtures of {Gaussians}},
	Url = {http://ieeexplore.ieee.org/iel3/4635/13030/00596253.pdf?tp=&arnumber=596253&isnumber=13030},
	Volume = {2},
	Year = {1997},
	Abstract = {This paper describes a new low bit-rate formant vocoder. The formant parameters are represented by Gaussian mixture distributions, which are estimated from the discrete Fourier transform (DFT) magnitude spectrum of the speech signal. A voiced/unvoiced classification mechanism has been developed based on the harmonic nature of each formant in the DFT spectrum modulated by the Gaussian mixture distribution. Using a magnitude-only sinusoidal synthesiser, intelligible synthetic speech has been obtained. Vector quantisation of the vocal tract parameters enables this formant vocoder to operate at a bit-rate of 1248 bps.},
}

% NOTE(review): the original "D. C. O. N." / "K. C. O. N." initials look like a
% "contributor" flag mis-parsed as initials by the exporter; reduced to the
% plain first initial. Confirm against the book's title page.
@book{patterson2003caq,
	Author = {Patterson, D. A. and Hennessy, J. L. and Goldberg, D. and Asanovic, K.},
	Edition = {3rd},
	Publisher = {Morgan Kaufmann},
	Title = {Computer Architecture: A Quantitative Approach},
	Year = {2003},
}

% NOTE(review): journal name completed and Volume/Number/Pages/Doi added from
% the published ACM record; verify before relying on them.
@article{Goldberg:1991,
	Author = {Goldberg, D.},
	Doi = {10.1145/103162.103163},
	Journal = {ACM Computing Surveys},
	Month = mar,
	Number = {1},
	Pages = {5--48},
	Title = {What Every Computer Scientist Should Know About Floating-Point Arithmetic},
	Url = {http://docs.sun.com/source/806-3568/ncg_goldberg.html},
	Volume = {23},
	Year = {1991},
	Abstract = {Floating-point arithmetic is considered an esoteric subject by many people. This is rather surprising because floating-point is ubiquitous in computer systems. Almost every language has a floating-point datatype; computers from PCs to supercomputers have floating-point accelerators; most compilers will be called upon to compile floating-point algorithms from time to time; and virtually every operating system must respond to floating-point exceptions such as overflow. This paper presents a tutorial on those aspects of floating-point that have a direct impact on designers of computer systems. It begins with background on floating-point representation and rounding error, continues with a discussion of the IEEE floating-point standard, and concludes with numerous examples of how computer builders can better support floating-point.},
}

@misc{vicfirthflam,
	Author = {{Vic Firth Education Team} and {The Percussive Arts Society}},
	Howpublished = {\url{http://www.vicfirth.com/education/rudiments/20flam.html}, retrieved 30th March 2007},
	Lastchecked = {30th March 2007},
	Title = {40 Essential Rudiments: The Flam},
	Url = {http://www.vicfirth.com/education/rudiments/20flam.html},
	Urldate = {30th March 2007},
	Year = {2007},
}

@misc{mirex06AODresults,
	Author = {{IMIRSEL}},
	Title = {{MIREX} 2006 {Audio} {Onset} {Detection} {Results}},
	Year = {2006},
	Howpublished = {\url{http://www.music-ir.org/mirex2006/index.php/Audio_Onset_Detection_Results}, retrieved 30th March 2007},
	Url = {http://www.music-ir.org/mirex2006/index.php/Audio_Onset_Detection_Results},
	Urldate = {30th March 2007},
	Lastchecked = {30th March 2007},
}

@inproceedings{Klapuri:1999,
	Author = {Klapuri, A.},
	Title = {Sound Onset Detection by Applying Psychoacoustic Knowledge},
	Booktitle = {IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
	Volume = {6},
	Pages = {3089--3092},
	Year = {1999},
	Url = {http://www.cs.tut.fi/sgn/arg/music/icassp99.pdf},
	Abstract = {A system was designed, which is able to detect the perceptual onsets of sounds in acoustic signals. The system is general in regard to the sounds involved and was found to be robust for different kinds of signals. This was achieved without assuming regularities in the positions of the onsets. In this paper, a method is first proposed that can determine the beginnings of sounds that exhibit onset imperfections, i.e., the amplitude envelope of which does not rise monotonically. Then the mentioned system is described, which utilizes band-wise processing and a psychoacoustic model of intensity coding to combine the results from the separate frequency bands. The performance of the system was validated by applying it to the detection of onsets in musical signals that ranged from rock to classical and big band recordings.},
}

% NOTE(review): journal title changed to the 2007 name of the journal, which is
% what the 10.1155/2007/... DOI appears to belong to; confirm on the DOI page.
@article{Lacoste:2007,
	Author = {Lacoste, A. and Eck, D.},
	Doi = {10.1155/2007/43745},
	Journal = {EURASIP Journal on Advances in Signal Processing},
	Month = aug,
	Title = {A Supervised Classification Algorithm for Note Onset Detection},
	Url = {http://www.iro.umontreal.ca/~eckdoug/papers/2006_eurasip_draft.pdf},
	Year = {2007},
}

@article{Davy:2006,
	Author = {Davy, M. and Desobry, F. and Gretton, A. and Doncarli, C.},
	Journal = {Signal Processing},
	Keywords = {Abnormality detection, Support Vector Machines, Sequential optimization, Gearbox fault detection, Audio thump detection},
	Month = aug,
	Number = {8},
	Pages = {2009--2025},
	Title = {An Online {Support} {Vector} {Machine} for Abnormal Events Detection},
	Url = {http://www-lagis.univ-lille1.fr/~davy/papers/Davy-SigProc-2005.pdf},
	Volume = {86},
	Year = {2006},
	Abstract = {The ability to detect online abnormal events in signals is essential in many real-world Signal Processing applications. Previous algorithms require an explicit signal statistical model, and interpret abnormal events as statistical model abrupt changes. Corresponding implementation relies on maximum likelihood or on Bayes estimation theory with generally excellent performance. However, there are numerous cases where a robust and tractable model cannot be obtained, and model-free approaches need to be considered. In this paper, we investigate a machine learning, descriptor-based approach that does not require an explicit descriptors statistical model, based on Support Vector novelty detection. A sequential optimization algorithm is introduced. Theoretical considerations as well as simulations on real signals demonstrate its practical efficiency.},
}

@article{Davy:2002,
	Author = {Davy, M. and Gretton, A. and Doucet, A. and Rayner, P. W. J.},
	Doi = {10.1109/LSP.2002.806070},
	Journal = {IEEE Signal Processing Letters},
	Keywords = {Model selection, optimized time-frequency representations, signal classification, support vector machines},
	Month = dec,
	Number = {12},
	Pages = {442--445},
	Title = {Optimised {Support} {Vector} {Machines} for Nonstationary Signal Classification},
	Url = {http://www-lagis.univ-lille1.fr/~davy/papers/Davy_letters_02.pdf},
	Volume = {9},
	Year = {2002},
	Abstract = {This letter describes an efficient method to perform nonstationary signal classification. A support vector machine (SVM) algorithm is introduced and its parameters optimized in a principled way. Simulations demonstrate that our low-complexity method outperforms state-of-the-art nonstationary signal classification techniques.},
}

@inproceedings{daudet1tmp,
	Author = {Daudet, L.},
	Title = {Transients modeling by pruned wavelet trees},
	Booktitle = {Proc. International Computer Music Conference (ICMC'01)},
	Year = {2001},
	Pages = {18--21},
}

@article{Olkkonen:2007,
	Author = {Olkkonen, H. and Olkkonen, J. T. and Pesola, P.},
	Doi = {10.1109/LSP.2006.879983},
	Journal = {IEEE Signal Processing Letters},
	Month = mar,
	Number = {3},
	Pages = {177--180},
	Title = {{FFT}-Based Computation of Shift Invariant Analytic Wavelet Transform},
	Url = {http://ieeexplore.ieee.org/iel5/97/4100639/04100655.pdf?tp=&arnumber=4100655&isnumber=4100639},
	Volume = {14},
	Year = {2007},
}

@inproceedings{johnson2006tiu,
	Address = {Paris, France},
	Author = {Johnson, C. G. and Gounaropoulos, A.},
	Booktitle = {Proceedings of {New} {Interfaces} for {Musical} {Expression} ({NIME})},
	Pages = {101--102},
	Publisher = {IRCAM---Centre Pompidou},
	Title = {Timbre interfaces using adjectives and adverbs},
	Year = {2006},
	Abstract = {How can we provide interfaces to synthesis algorithms that will allow us to manipulate timbre directly, using the same timbre-words that are used by human musicians to communicate about timbre? This paper describes ongoing work that uses machine learning methods (principally genetic algorithms and neural networks) to learn (1) to recognise timbral characteristics of sound and (2) to adjust timbral characteristics of existing synthesized sounds.},
}

@article{Peltola:2007,
	Author = {Peltola, L. and Erkut, C. and Cook, P. R. and V{\"a}lim{\"a}ki, V.},
	Issn = {1558-7916},
	Journal = {IEEE Transactions on Audio, Speech, and Language Processing},
	Keywords = {Acoustic resonator filters; acoustic signal processing; control systems; emotions; signal synthesis},
	Number = {3},
	Pages = {1021--1029},
	Title = {Synthesis of Hand Clapping Sounds},
	Volume = {15},
	Year = {2007},
	Abstract = {We present two physics-based analysis, synthesis, and control systems for synthesizing hand clapping sounds. They both rely on the separation of the sound synthesis and event generation, and both are capable of producing individual hand-claps, or mimicking the asynchronous/synchronized applause of a group of clappers. The synthesis models consist of resonator filters, whose coefficients are derived from experimental measurements. The difference between these systems is mainly in the statistical event generation. While the first system allows an efficient parametric synthesis of large audiences, as well as flocking and synchronization by simple rules, the second one provides parametric extensions for synthesis of various clapping styles and enhanced control strategies. The synthesis and the control models of both systems are implemented as software running in real time at the audio sample rate, and they are available for download at http://ccrma-www.stanford.edu/software/stk and http://www.acoustics.hut.fi/go/clapd.},
}

@article{Ruinskiy:2007,
	Author = {Ruinskiy, D. and Lavner, Y.},
	Issn = {1558-7916},
	Journal = {IEEE Transactions on Audio, Speech, and Language Processing},
	Keywords = {Breath detection; event spotting in speech and audio; mel frequency cepstral coefficient (MFCC)},
	Number = {3},
	Pages = {838--850},
	Title = {An Effective Algorithm for Automatic Detection and Exact Demarcation of Breath Sounds in Speech and Song Signals},
	Volume = {15},
	Year = {2007},
	Abstract = {Automatic detection of predefined events in speech and audio signals is a challenging and promising subject in signal processing. One important application of such detection is removal or suppression of unwanted sounds in audio recordings, for instance in the professional music industry, where the demand for quality is very high. Breath sounds, which are present in most song recordings and often degrade the aesthetic quality of the voice, are an example of such unwanted sounds. Another example is bad pronunciation of certain phonemes. In this paper, we present an automatic algorithm for accurate detection of breaths in speech or song signals. The algorithm is based on a template matching approach, and consists of three phases. In the first phase, a template is constructed from mel frequency cepstral coefficients (MFCCs) matrices of several breath examples and their singular value decompositions, to capture the characteristics of a typical breath event. Next, in the initial processing phase, each short-time frame is compared to the breath template, and marked as breathy or nonbreathy according to predefined thresholds. Finally, an edge detection algorithm, based on various time-domain and frequency-domain parameters, is applied to demarcate the exact boundaries of each breath event and to eliminate possible false detections. Evaluation of the algorithm on a database of speech and songs containing several hundred breath sounds yielded a correct identification rate of 98{\%} with a specificity of 96{\%}.},
}

@article{Resch:2007,
	Author = {Resch, B. and Nilsson, M. and Ekman, A. and Kleijn, W. B.},
	Issn = {1558-7916},
	Journal = {IEEE Transactions on Audio, Speech, and Language Processing},
	Keywords = {Instantaneous pitch; pitch estimation; pitch-synchronous processing; splines},
	Number = {3},
	Pages = {813--822},
	Title = {Estimation of the Instantaneous Pitch of Speech},
	Volume = {15},
	Year = {2007},
	Abstract = {An accurate estimation of the pitch is essential for many speech processing applications, such as speech synthesis, speech coding, and speech enhancement. A widely used assumption in most common pitch estimation methods is that pitch is constant over a segment of short duration. This assumption does not apply in reality and leads to inaccurate pitch estimates. In this paper, we present a method for continuous pitch estimation that is able to track fast changes. In the presented framework, the pitch is modeled by a B-spline expansion and optimized in a multistage procedure for increased robustness. The performance of the continuous optimization procedure is compared to state-of-the-art pitch estimation methods and is evaluated both for artificial speech-like signals with known pitch, and for real speech signals. The results of the experiments show that our method leads to a higher accuracy of the estimate of the pitch than state-of-the-art methods.},
}

@article{Luo:2006,
	Author = {Luo, A. C. J. and Gegg, B. C.},
	Doi = {10.1142/S0218127406016975},
	Journal = {International Journal of Bifurcation and Chaos},
	Keywords = {Discontinuous dynamical system; grazing bifurcation; stick (or sliding) bifurcation; friction-induced oscillator},
	Number = {12},
	Pages = {3539--3566},
	Title = {Dynamics of a Harmonically Excited Oscillator with Dry-Friction on a Sinusoidally Time-Varying, Traveling Surface},
	Url = {http://www.worldscinet.com/ijbc/16/preserved-docs/1612/S0218127406016975.pdf},
	Volume = {16},
	Year = {2006},
	Abstract = {In this paper, periodic motion in an oscillator moving on the periodically traveling belts with dry friction is investigated. The conditions of stick and nonstick motions for such an oscillator are obtained in the relative motion frame, and the grazing and stick (or sliding) bifurcations are presented as well. The periodic motions of such an oscillator are predicted analytically and numerically, and the analytical prediction is based on the appropriate mapping structures. The local stability and bifurcation for such periodic motions are obtained. The periodic motions are illustrated through the displacement, velocity and force responses in absolute and relative frames. This investigation provides an efficient method to predict periodic motions of such an oscillator involving dry-friction. The significance of this investigation lies in controlling motion of such friction-induced oscillator in industry.},
}

@book{Moore:1982,
	Address = {London, UK},
	Author = {Moore, B. C. J.},
	Edition = {2nd},
	Keywords = {Psychology, psychoacoustics, hearing},
	Publisher = {Academic Press},
	Title = {An Introduction to the Psychology of Hearing},
	Year = {1982},
}

@article{mccartney2002rcm,
	Author = {McCartney, J.},
	Doi = {10.1162/014892602320991383},
	Journal = {Computer Music Journal},
	Keywords = {SuperCollider},
	Number = {4},
	Pages = {61--68},
	Publisher = {MIT Press},
	Title = {Rethinking the Computer Music Language: {SuperCollider}},
	Url = {http://muse.jhu.edu/journals/computer_music_journal/v026/26.4mccartney.pdf},
	Volume = {26},
	Year = {2002},
	Abstract = {Designing a new computer music language requires one to answer certain questions. Some of these questions may at first glance seem trivial but on further examination are rather deep. The following questions fit this category. What is a computer language? What is the difference between a high-level and a low-level language? What do current computer music languages do for you? What should a computer music language do? How can computer language abstractions be applied to computer music? Is a specialized computer music language even necessary? This article discusses the above questions, how they led to creating the SuperCollider language, and some current development and further directions for SuperCollider.},
}

% NOTE(review): the original author list was "ORLAREY, Y. and and Albert, G. and
% Kersten, S."; the doubled "and" produced an empty author and "Albert, G."
% looks like a mangled "Albert Gr{\"a}f". Confirm the second author against the PDF.
@inproceedings{orlarey:dpf,
	Author = {Orlarey, Y. and Gr{\"a}f, A. and Kersten, S.},
	Booktitle = {Proc. International Computer Music Conference (ICMC'06)},
	Keywords = {SuperCollider, Q, FAUST},
	Title = {{DSP} Programming with {Faust}, {Q} and {SuperCollider}},
	Url = {http://q-lang.sourceforge.net/icmc06/icmc06.pdf},
	Year = {2006},
	Abstract = {Faust is a functional programming language for real-time signal processing and synthesis that targets high-performance signal processing applications and audio plugins. The paper gives a brief introduction to Faust and discusses its interfaces to Q, a general-purpose functional programming language, and SuperCollider, an object-oriented sound synthesis language and engine.},
}

% NOTE(review): Year was missing; 2001 is inferred from the "icmc_01" URL and
% the adjacent page range of daudet1tmp (ICMC'01, pp. 18--21). Confirm.
@inproceedings{hainsworth:tfr,
	Author = {Hainsworth, S. W. and Wolfe, P. J.},
	Booktitle = {Proc. International Computer Music Conference (ICMC'01)},
	Pages = {14--17},
	Title = {Time-Frequency Reassignment for Music Analysis},
	Url = {http://people.deas.harvard.edu/~patrick/publications/hainsworth_icmc_01.pdf},
	Year = {2001},
	Abstract = {Time-frequency reassignment may be viewed as a refinement of the short-time Fourier transform, in which phase information is used to reduce the smearing of energy associated with the standard spectrogram. However, even given the perceptibly clearer visual representation yielded by the reassignment method in the case of musical signals, the task remains of extracting useful information from it for further processing. To this end it is proposed that time reassignment information be used to help identify musical transients, and that frequency reassignment information be similarly employed as a means of estimating the pitch of musical signal components. To illustrate these ideas, an example is shown in which reassigned time and frequency points are used to segment a monophonic piano melody and locate the partials of its individual notes. Lastly, the potential role of reassignment in the overall framework of music transcription is described, and several areas are detailed for future study.},
}

% NOTE(review): the abstract contained a U+FFFC placeholder where an inline
% symbol was lost; it is restored as $C_0$ on the assumption that the measure
% meant is the C0 complexity. Confirm against the published abstract.
@article{Gu:2003,
	Author = {Gu, Fanji and Meng, Xin and Shen, Enhua and Cai, Zhijie},
	Doi = {10.1142/S0218127403006893},
	Journal = {International Journal of Bifurcation and Chaos},
	Keywords = {Complexity; mutual information; consciousness},
	Number = {3},
	Pages = {733--742},
	Title = {Can We Measure Consciousness with {EEG} Complexities?},
	Url = {http://ejournals.worldscientific.com.sg/ijbc/13/1303/S0218127403006893.html},
	Volume = {13},
	Year = {2003},
	Abstract = {Several complexity measures, especially approximate entropy (ApEn) and a new defined complexity measure {$C_0$}, of EEG signals or the ones of the mutual information transmission between different channels of EEGs were calculated to distinguish different consciousness levels for different brain functional states. All of the measures decreased with the following order of brain states: rest with eyes open, eyes closed, light sleep and deep sleep. They decreased during epileptic seizures. On the contrary, the averaged mutual information between different channels increased significantly during the epileptic seizure; there is no significant difference among the averaged mutual information for the subject resting with eyes open, closed, being in light sleep and in deep sleep. Thus, the former indexes seem to be promising candidates to characterize different consciousness levels, while the latter seems not.},
}

@article{Kotnik:2003,
	Author = {Kotnik, B. and Vlaj, D. and Horvat, B.},
	Title = {Efficient noise robust feature extraction algorithms for {Distributed} {Speech} {Recognition} ({DSR}) systems},
	Journal = {International Journal of Speech Technology},
	Volume = {6},
	Pages = {205--219},
	Year = {2003},
	Keywords = {noise robustness, distributed speech recognition, frame-attenuation, spectral substitution, feature extraction},
}

@article{Mak:2005,
	Author = {Mak, M.-W. and Sit, C.-H. and Kung, S.-Y.},
	Journal = {International Journal of Speech Technology},
	Keywords = {distributed speaker verification, DSR, DSR front-end processing, feature transformation},
	Pages = {67--77},
	Title = {Extraction of speaker features from different stages of {DSR} front-ends for distributed speaker verification},
	Url = {http://www.eie.polyu.edu.hk/~mwmak/papers/ijst04.pdf},
	Volume = {8},
	Year = {2005},
	Abstract = {The ETSI has recently published a front-end processing standard for distributed speech recognition systems. The key idea of the standard is to extract the spectral features of speech signals at the front-end terminals so that acoustic distortion caused by communication channels can be avoided. This paper investigates the effect of extracting spectral features from different stages of the front-end processing on the performance of distributed speaker verification systems. A technique that combines handset selectors with stochastic feature transformation is also employed in a back-end speaker verification system to reduce the acoustic mismatch between different handsets. Because the feature vectors obtained from the back-end server are vector quantized, the paper proposes two approaches to adding Gaussian noise to the quantized feature vectors for training the Gaussian mixture speaker models. In one approach, the variances of the Gaussian noise are made dependent on the codeword distance. In another approach, the variances are a function of the distance between some unquantized training vectors and their closest code vector. The HTIMIT corpus was used in the experiments and results based on 150 speakers show that stochastic feature transformation can be added to the back-end server for compensating transducer distortion. It is also found that better verification performance can be achieved when the LMS-based blind equalization in the standard is replaced by stochastic feature transformation.},
}

@article{Rentzos:2005,
	Author = {Rentzos, D. and Vaseghi, S. and Yan, Q.},
	Title = {Parametric formant modelling and transformation in voice conversion},
	Journal = {International Journal of Speech Technology},
	Volume = {8},
	Pages = {227--245},
	Year = {2005},
	Keywords = {voice conversion, morphing, formant, HMMs},
}

@article{Zhao:1999,
	Author = {Zhao, W. W. and Ogunfunmi, T.},
	Journal = {International Journal of Speech Technology},
	Keywords = {Wigner-Ville Distribution, speech signals, formant and pitch estimation, time-frequency distribution},
	Pages = {35--49},
	Title = {Formant and pitch detection using time-frequency distribution},
	Volume = {3},
	Year = {1999},
}

@book{rabiner1978dps,
	Address = {Englewood Cliffs, NJ},
	Author = {Rabiner, L. R. and Schafer, R. W.},
	Publisher = {Prentice-Hall},
	Title = {Digital Processing of Speech Signals},
	Year = {1978},
}

@inproceedings{Welch:2001,
	Author = {Welch, Greg and Bishop, Gary},
	Booktitle = {SIGGRAPH 2001},
	Title = {An Introduction to the {Kalman} Filter},
	Url = {http://www.menem.com/ilya/digital_library/control/welch-bishop-01.pdf},
	Year = {2001},
	Abstract = {The Kalman filter is a mathematical power tool that is playing an increasingly important role in computer graphics as we include sensing of the real world in our systems. The good news is you don't have to be a mathematical genius to understand and effectively use Kalman filters. This tutorial is designed to provide developers of graphical systems with a basic understanding of this important mathematical tool.},
}

@inproceedings{parra2001akf,
	Author = {Parra, L. and Jain, U.},
	Booktitle = {Proc. of {IEEE} {WASPAA}},
	Month = oct,
	Pages = {75--78},
	Title = {Approximate {Kalman} filtering for the harmonic plus noise model},
	Url = {http://newton.bme.columbia.edu/~lparra/publish/waspaa_parra_jain.pdf},
	Year = {2001},
	Abstract = {We present a probabilistic description of the Harmonic plus Noise Model (HNM) for speech signals. This probabilistic formulation permits Maximum Likelihood (ML) parameter estimation and speech synthesis becomes a straightforward sampling from a distribution. It also permits development of a Kalman filter that tracks model parameters such as pitch, harmonic amplitudes, and autoregressive coefficients. We focus here on pitch tracking for which the estimator is highly non-linear. As a result it is necessary to develop an approximate Kalman filter that goes beyond extended Kalman filtering.},
}

@incollection{Williams:1999,
	Author = {Williams, C. K. I.},
	Booktitle = {Learning in Graphical Models},
	Editor = {Jordan, M. I.},
	Pages = {599--621},
	Publisher = {The MIT Press},
	Title = {Prediction with {Gaussian} Processes: From Linear Regression to Linear Prediction and Beyond},
	Url = {http://www.dai.ed.ac.uk/homes/ckiw/postscript/NCRG_97_012.ps.gz},
	Year = {1999},
	Abstract = {The main aim of this paper is to provide a tutorial on regression with Gaussian processes. We start from Bayesian linear regression, and show how by a change of viewpoint one can see this method as a Gaussian process predictor based on priors over functions, rather than on priors over parameters. This leads to a more general discussion of Gaussian processes in section 4. Section 5 deals with further issues, including hierarchical modelling and the setting of the parameters that control the Gaussian process, the covariance functions for neural network models and the use of Gaussian processes in classification problems.},
}

@phdthesis{Townsend:1994,
	Author = {Townsend, M.},
	Month = sep,
	School = {King's College, University of London},
	Title = {Analysis of percussive sounds using {Linear} {Predictive} {Coding}},
	Year = {1994},
	Abstract = {This thesis is concerned with the modelling of sounds from percussive instruments, and the detection and observation of changes in the component frequencies of these sounds.

Linear Predictive Coding (LPC) is now widely established as an efficient means of coding human speech signals. This technique often works by assuming the acoustic system generating the signal of interest is resonant, so the progression to modelling resonant musical instruments is a simple one, though some modifications are required. This thesis extends this mainstream knowledge with a wide range of re-synthesis strategies and experimental results, and shows LPC to be a versatile tool in the synthesis of electronic music.

The resonances of the acoustic system are encoded within the LPC model of the sound. However, these resonant frequencies may be extracted individually by further analysis. For this, previous work with speech signals is elaborated upon, resulting in a highly accurate signal processing method and its implementation. Investigation of both synthetic and real world is presented, together with subsequent improvements to the algorithm.

The physical properties of the acoustic source can often cause the resonant frequencies to change in a deterministic manner; this is particularly the case with the stretching of a drum-skin. These movements may be traced as continuous lines in time/frequency space. In image processing the detection of lines and other patterns often find their solution in the Hough Transform (HT). Here, a novel application of this technique and its implementation are described, leading to the use of the HT to predict the trajectories resonant frequencies will take.

The major new contributions of this work are a detailed study of the analysis and synthesis of percussive sounds with high order LPC models; a highly accurate formant extraction algorithm, which has potential applications in any field of signal processing where resonant modes are analysed; and a novel application of the Hough Transform designed to identify patterns within the analysed data.},
}

@article{Rahman:2005,
	Author = {Rahman, M. S. and Shimamura, T.},
	Doi = {10.1250/ast.26.502},
	Journal = {Acoustical Science and Technology},
	Keywords = {Linear prediction, Autocorrelation ``aliasing,'' Minimum-phase cepstrum, Liftering, Fundamental frequency effect},
	Month = apr,
	Number = {6},
	Pages = {502--510},
	Title = {Formant frequency estimation of high-pitched speech by homomorphic prediction},
	Url = {http://www.jstage.jst.go.jp/article/ast/26/6/502/_pdf},
	Volume = {26},
	Year = {2005},
	Abstract = {The conventional model of the linear prediction analysis suffers from difficulties in
estimating vocal tract characteristics of high-pitched speakers. This is because the autocorrelation
function used by the autocorrelation method of linear prediction for estimating autoregressive
coefficients is actually an ``aliased'' version of that of the vocal tract impulse response. This ``aliasing''
occurs due to the periodic nature of voiced speech. Generally it is accepted that homomorphic filtering
can be used to obtain an estimate of vocal tract impulse response which is free from periodicity. Thus
linear prediction of the resulting vocal tract impulse response (referred to as homomorphic prediction)
is expected to be free from variations of fundamental frequencies. To our knowledge any experimental
study, however, has not yet appeared on the suitability of this method for analyzing high-pitched
speech. This paper presents a detail study on the prospects of homomorphic prediction as a formant
tracking tool especially for high-pitched speech where linear prediction fails to obtain accurate
estimation. The formant frequencies estimated using the proposed method are found to be accurate by
more than an order of magnitude compared to the conventional procedure. The accuracy of formant
estimation is verified on synthetic vowels for a wide range of pitch periods covering typical male and
high-pitched female speakers. The validity of the proposed method is also examined by inspecting the
spectral envelopes of natural speech spoken by high-pitched female speakers. We noticed that almost
all the previous methods dealing with this limitation of linear prediction are based on the covariance
technique where the obtained AR filter can be unstable. The solutions obtained by the current method
are guaranteed to be stable which makes it superior for many speech analysis applications.
},
}

@inproceedings{Laprie:2004,
	Author = {Laprie, Y.},
	Booktitle = {Proc InterSpeech-ICSLP},
	Keywords = {speech, formant tracking, speech analysis},
	Title = {A concurrent curve strategy for formant tracking},
	Year = {2004},
	Abstract = {Although automatic formant tracking has a wide range of potential
applications it is still an open problem. We previously
proposed the use of active curves that deform under the influence
of the spectrogram energy. Each formant was tracked independently
and a complex strategy was required to guarantee
the overall formant tracking consistency. This paper describes
how the interdependency between formants can be incorporated
directly during the deformations of formant tracks. Iterative
processes attached to each formant are interlaced. We experimented
two strategies. The first consists in partitioning the
spectrogram into exclusive regions, each region affiliated to a
given formant. The second consists in adding a repulsion force
between formants that prevent formant tracks to merge together.
It turns out that the second strategy is more robust and does not
necessitate a complex control strategy.},
}

@inproceedings{Kammoun:2004,
	Author = {Kammoun, M. A. and Gargouri, D. and Frikha, M. and Ben Hamida, A.},
	Booktitle = {Industrial Technology, 2004. IEEE ICIT '04. 2004 IEEE International Conference on},
	Doi = {10.1109/ICIT.2004.1490808},
	Pages = {1612--1616},
	Title = {Cepstral method evaluation in speech formant frequencies estimation},
	Url = {http://ieeexplore.ieee.org/iel5/9977/32037/01490808.pdf?tp=&arnumber=1490808&isnumber=32037},
	Volume = {3},
	Year = {2004},
	Abstract = {This paper presents a technique for formant estimation using cepstral envelope analysis. The presumed method which computes cepstrum has been implemented with Matlab and was applied to the problem of accurate measurement of formant frequencies. The conceived algorithm picks formant frequencies from the smoothed spectrum. The approach relies on decomposing the speech signal into two components: the first component presents the excitation, while the second component is intended to present vocal tract resonances. Such procedure was then achieved by applying the homomorphic deconvolution to the treated speech signal. The obtained result, i.e the cepstrum, was then used to estimate the smoothed spectrum. Formant picking is achieved by localizing the spectral maxima from the smoothed envelope. Results showed that there is a wide range in the estimated values of formant frequencies for male and female speakers. Such cepstral method evaluation confirms the limitation of the use of this technique in the estimation of formant frequencies.},
}

@book{Rabiner:1993,
	Address = {Upper Saddle River, NJ, USA},
	Author = {Rabiner, L. and Juang, B.-H.},
	Isbn = {0-13-015157-2},
	Publisher = {Prentice-Hall, Inc.},
	Title = {Fundamentals of Speech Recognition},
	Year = {1993},
}

@inproceedings{collins:upd,
	Author = {Collins, N.},
	Booktitle = {Proc. Int. Symposium on Music Information Retrieval (ISMIR)},
	Keywords = {onset detection, pitch},
	Pages = {100--106},
	Title = {Using a Pitch Detector for Onset Detection},
	Url = {http://ismir2005.ismir.net/proceedings/1008.pdf},
	Year = {2005},
	Abstract = {A segmentation strategy is explored for monophonic instrumental pitched non-percussive material (PNP) which proceeds from the assertion that human-like event analysis can be founded on a notion of stable pitch percept. A constant-Q pitch detector following the work of Brown and Puckette provides pitch tracks which are post processed in such a way as to identify likely transitions between notes. A core part of this preparation of the pitch detector signal is an algorithm for vibrato suppression. An evaluation task is undertaken on slow attack and high vibrato PNP source files with human annotated onsets, exemplars of a difficult case in monophonic source segmentation. The pitch track onset detection algorithm shows an improvement over the previous best performing algorithm from a recent comparison study of onset detectors. Whilst further timbral cues must play a part in a general solution, the method shows promise as a component of a note event analysis system.},
}

@article{GARTH:1994,
	Author = {Garth, L. M. and Poor, H. V.},
	Date = {JUL},
	Doi = {10.1109/5.293163},
	Isi = {ISI:A1994NX70700010},
	Journal = {Proceedings of the IEEE},
	Month = jul,
	Number = {7},
	Pages = {1061--1095},
	Publication-Type = {J},
	Title = {Detection of {Non-Gaussian} Signals: A Paradigm for Modern Statistical Signal Processing},
	Url = {http://ieeexplore.ieee.org/iel1/5/7241/00293163.pdf?isnumber=7241&prod=JNL&arnumber=293163&arSt=1061&ared=1095&arAuthor=Garth%2C+L.M.%3B+Poor%2C+H.V.},
	Volume = {82},
	Year = {1994},
}

@inproceedings{Vich:2001,
	Author = {Vich, R. and Pribil, J. and Smekan, Z.},
	Booktitle = {EUROCON'2001, Trends in Communications, International Conference on},
	Keywords = {cepstral analysis; poles and zeros; speech synthesis; TTS synthesis; antiformants; cepstral models; formants; speech synthesis; vocal tract models; zero-pole models},
	Pages = {459--462},
	Title = {New cepstral zero-pole vocal tract models for TTS synthesis},
	Ty = {CONF},
	Url = {http://ieeexplore.ieee.org/iel5/7466/20308/00938161.pdf?isnumber=&arnumber=938161},
	Volume = {2},
	Year = {2001},
	Abstract = {Speech is an analog sound signal produced by exciting the human vocal tract. The magnitude response of the vocal tract exhibits both peaks (formants) and valleys (antiformants). Vocal tract models are differentiated according to whether they model the formants alone (LPC models) or also antiformants (ARMA and cepstral models). New structures are proposed for an effective realization of cepstral vocal tract models that model both formants and antiformants},
}

@inproceedings{Gargouri:2004,
	Author = {Gargouri, D. and Frikha, M. and Laffet, M. W. and Kamoun, M. A. and Ben Hamida, A.},
	Booktitle = {Industrial Technology, 2004. IEEE ICIT '04. 2004 IEEE International Conference on},
	Keywords = {cepstral analysis; deconvolution; speaker recognition; biomedical application; cepstral analysis; formants frequencies determination; speaker identification; speech based parameterization; speech recognition; speech signal deconvolution; vocal cords},
	Pages = {1298--1302},
	Title = {Cepstral analysis for formants frequencies determination dedicated to speaker identification},
	Ty = {CONF},
	Url = {http://ieeexplore.ieee.org/iel5/9977/32037/01490748.pdf?isnumber=&arnumber=1490748},
	Volume = {3},
	Year = {2004},
	Abstract = {In this paper, we present a technique of parameterization of the speech based on the cepstral analysis, for the extraction of the first four formants F1, F2, F3 and F4 with the aim of a biomedical application. Indeed, such analysis, supposed linear, assures the speech signal deconvolution. It allows separating the contribution of the vocal tract, i.e. the formants frequencies, and the one of the vocal cords responsible of the fundamental frequency. The technique applied to some vowels extracted from the TIMIT database, allows identifying the variations interlocutors of the formants frequencies according to the sex and of the region. Variability interlocutor is a major phenomenon in speech recognition because a speaker remains recognizable by the timbre of his voice in spite of a variation which can sometimes be significant. Results so obtained allow noticing the variability of the formants frequencies of a vowel pronounced by various speakers. So, several scenarios were tested to know: 1) a vowel pronounced by four men and four women who lived in the same region, 2) a vowel pronounced by four women of the same region, and 3) a vowel pronounced by eight men who lived in different regions.},
}

@article{OShaughnessy:2003,
	Author = {O'Shaughnessy, D.},
	Date = {SEP},
	Doi = {10.1109/JPROC.2003.817117},
	Isi = {ISI:000185262500001},
	Journal = {Proceedings of the IEEE},
	Month = sep,
	Number = {9},
	Pages = {1272--1305},
	Publication-Type = {J},
	Title = {Interacting with computers by voice: Automatic speech recognition and synthesis},
	Volume = {91},
	Year = {2003},
	Abstract = {This paper examines how people communicate with computers
using speech. Automatic speech recognition (ASR) transforms
speech into text, while automatic speech synthesis [or text-to-speech
(TTS)] performs the reverse task. ASR has largely developed based
on speech coding theory, while simulating certain spectral analyses
performed by the ear. Typically, a Fourier transform is employed,
but following the auditory Bark scale and simplifying the spectral
representation with a decorrelation into cepstral coefficients.
Current ASR provides good accuracy and performance on limited
practical tasks, but exploits only the most rudimentary knowledge
about human production and perception phenomena. The popular
mathematical model called the hidden Markov model (HMM) is
examined; first-order HMMs are efficient but ignore long-range
correlations in actual speech. Common language models use a
time window of three successive words in their syntactic--semantic
analysis.

Speech synthesis is the automatic generation of a speech waveform,
typically from an input text. As with ASR, TTS starts from a
database of information previously established by analysis of much
training data, both speech and text. Previously analyzed speech is
stored in small units in the database, for concatenation in the proper
sequence at runtime. TTS systems first perform text processing,
including ``letter-to-sound'' conversion, to generate the phonetic
transcription. Intonation must be properly specified to approximate
the naturalness of human speech. Modern synthesizers using large
databases of stored spectral patterns or waveforms output highly
intelligible synthetic speech, but naturalness remains to be improved.},
}

@article{BILMES:2006,
	Author = {Bilmes, Jeff A.},
	Doi = {10.1093/ietisy/e89-d.3.869},
	Journal = {IEICE Transactions on Information and Systems},
	Keywords = {automatic speech recognition, hidden Markov models, HMMs, time-series processes, hand-writing recognition, graphical models, dynamic Bayesian networks, dynamic graphical models, stochastic processes, time-series densities, bio-informatics},
	Number = {3},
	Pages = {869--891},
	Title = {What {HMMs} Can Do},
	Url = {http://ietisy.oxfordjournals.org/cgi/content/abstract/E89-D/3/869},
	Volume = {E89-D},
	Year = {2006},
	Abstract = {Since their inception almost fifty years ago, hidden Markov models (HMMs) have have become the predominant methodology for automatic speech recognition (ASR) systems---today, most state-of-the-art speech systems are HMM-based. There have been a number of ways to explain HMMs and to list their capabilities, each of these ways having both advantages and disadvantages. In an effort to better understand what HMMs can do, this tutorial article analyzes HMMs by exploring a definition of HMMs in terms of random variables and conditional independence assumptions. We prefer this definition as it allows us to reason more throughly about the capabilities of HMMs. In particular, it is possible to deduce that there are, in theory at least, no limitations to the class of probability distributions representable by HMMs. This paper concludes that, in search of a model to supersede the HMM (say for ASR), rather than trying to correct for HMM limitations in the general case, new models should be found based on their potential for better parsimony, computational requirements, and noise insensitivity.},
}

@article{Katagiri:1998,
	Author = {Katagiri, S. and Juang, B. and Lee, C.},
	Doi = {10.1109/5.726793},
	Journal = {Proceedings of the IEEE},
	Keywords = {Bayes methods, decision theory, pattern recognition, probability, speech recognition},
	Number = {11},
	Pages = {2345--2373},
	Title = {Pattern recognition using a family of design algorithms based upon the generalized probabilistic descent method},
	Url = {http://ieeexplore.ieee.org/iel4/5/15641/00726793.pdf?isnumber=15641&arnumber=726793},
	Volume = {86},
	Year = {1998},
	Abstract = {This paper provides a comprehensive introduction to a novel approach to pattern recognition which is based on the generalized probabilistic descent method (GPD) and its related design algorithms. The paper contains a survey of recent recognizer design techniques, the formulation of GPD, the concept of minimum classification error learning that is closely related to the GPD formalization, a relational analysis between GPD and other important design methods, and various embodiments of GPD-based design, including segmental-GPD, minimum spotting error training, discriminative utterance verification, and discriminative feature extraction. GPD development has its origins in basic pattern recognition and Bayes decision theory. It represents a simple but careful re-investigation of the classical theory and successfully leads to an innovative framework. For clarity of presentation, detailed discussions about its embodiments are provided for examples of speech pattern recognition tasks that use a distance-based classifier. Experimental results in speech pattern recognition tasks clearly demonstrate the remarkable utility of the family of GPD-based design algorithms},
}

@article{Trentin:2001,
	Author = {Trentin, Edmondo and Gori, Marco},
	Journal = {Neurocomputing},
	Keywords = {Speech recognition; Hidden Markov model; Neural network; Hybrid system},
	Month = apr,
	Number = {1--4},
	Pages = {91--126},
	Title = {A survey of hybrid {ANN/HMM} models for automatic speech recognition},
	Url = {http://www.sciencedirect.com/science/article/B6V10-42FS7GW-5/2/1847a9a0ed5e9cf224490bdf544810fa},
	Volume = {37},
	Year = {2001},
	Abstract = {In spite of the advances accomplished throughout the last decades, automatic speech recognition (ASR) is still a challenging and difficult task. In particular, recognition systems based on hidden Markov models (HMMs) are effective under many circumstances, but do suffer from some major limitations that limit applicability of ASR technology in real-world environments. Attempts were made to overcome these limitations with the adoption of artificial neural networks (ANN) as an alternative paradigm for ASR, but ANN were unsuccessful in dealing with long time-sequences of speech signals. Between the end of the 1980s and the beginning of the 1990s, some researchers began exploring a new research area, by combining HMMs and ANNs within a single, hybrid architecture. The goal in hybrid systems for ASR is to take advantage from the properties of both HMMs and ANNs, improving flexibility and recognition performance. A variety of different architectures and novel training algorithms have been proposed in literature. This paper reviews a number of significant hybrid models for ASR, putting together approaches and techniques from a highly specialistic and non-homogeneous literature. Efforts concentrate on describing and referencing architectures and algorithms, their advantages and limitations, as well as on categorizing them into broad classes. Early attempts to emulate HMMs by ANNs are first described. Then we focus on ANNs to estimate posterior probabilities of the states of an HMM and on 'global' optimization, where a single, overall training criterion is defined over the HMM and the ANNs. Connectionist vector quantization for discrete HMMs, and other more recent approaches are also reviewed. It is pointed out that, in addition to their theoretical interest, hybrid systems have been allowing for tangible improvements in recognition performance over the standard HMMs in difficult and significant benchmark tasks.},
}

@inproceedings{MTGFabi2004:300,
	Address = {Naples, Italy},
	Author = {Fabig, L. and Janer, J.},
	Booktitle = {Proceedings of 7th International Conference on Digital Audio Effects},
	Keywords = {hoarseness, Muscle Tension Dysphonia, pressed phonation},
	Title = {Transforming Singing Voice Expression -- The Sweetness Effect},
	Url = {http://www.iua.upf.es/mtg/publications/DAFX04-FabigJaner.pdf},
	Year = {2004},
	Abstract = {We propose a real-time system which is targeted to music production in the context of vocal recordings. The aim is to transform the singer's voice characteristics in order to achieve a sweet sounding voice. It combines three different transformations namely Sub-Harmonic Component Reduction (reduction of sub-harmonics, which are found in voices with vocal disorders), Vocal Tract Excitation Modification (to achieve a change in loudness) and the Intonation Modification (to achieve smoother transitions in pitch). The transformations are done in the frequency domain based on an enhanced phase-locked vocoder. The Expression Adaptive Control estimates the amount of present vocal disorder in the singer's voice. This estimate automatically controls the amount of Sub-Harmonic Component reduction to assure a natural sounding transformation.},
}

@inproceedings{Loscos:2005a,
	Author = {Loscos, A. and Celma, O.},
	Title = {Larynxophone: Using Voice As A Wind Controller},
	Booktitle = {Proceedings of International Computer Music Conference 2005},
	Address = {Barcelona},
	Year = {2005},
	Url = {http://www.music.mcgill.ca/~ich/research/misc/papers/cr1308.pdf},
	Abstract = {In the context of music composition and production using MIDI sequencers, wind instrument tracks are built on the synthesis of music scores that have been written using whether MIDI keyboards or mouse clicks. Such modus operandi clearly handicaps the musician when it comes to shape the resulting audio with the desired expression. This paper presents a straightforward method to create convincing wind instrument audio tracks avoiding intermediate MIDI layers and easing expression control. The method stands on the musician ability to mimic, by singing or humming, the desired wind instrument performance. From this vocal performance, a set of voice features are extracted and used to drive a real-time cross-synthesis between samples of a wind instrument database and the musician's voice signal.},
}

@inproceedings{MTGBona2005:320,
	Address = {Barcelona},
	Author = {Bonada, J.},
	Booktitle = {Proceedings of 118th Audio Engineering Society Convention},
	Title = {Voice Solo to Unison Choir Transformation},
	Year = {2005},
	Abstract = {In this paper we present a transformation that pretends to convert a voice solo into a large, unison choir. The basic idea behind the presented algorithm is to morph the input voice solo (dry recording) with a recorded sustained vowel of a unison choir. The processing algorithm is based on the rigid phase-locked vocoder adapted to harmonic sounds. Pitch and timbre are taken from the voice solo, and the local spectrum comes out from the analysis of the unison choir sample.},
}

@inproceedings{MTGKalt2005:351,
	Author = {Kaltenbrunner, M.},
	Title = {Interactive Music for Mobile Digital Music Players},
	Booktitle = {Proc. International Computer Music Conference (ICMC'05)},
	Address = {Barcelona},
	Year = {2005},
	Url = {http://www.iua.upf.edu/mtg/publications/72befe-icmc2005-mkalten.pdf},
	Abstract = {The iPod and similar mobile digital music players are becoming part of our everyday life, enabling anyone to carry his complete music collection to anywhere. While facilitating the permanent and immediate playback of any desired song, these devices generally lack an important feature: the playful interaction with the music playback and the creation of new music itself. After initial experiments with the port of ucLinux to the iPod, we believe that playful mobile music creation needs to be an integral part of the digital lifestyle device of the future. We managed to port Pure Data (Pd) to the iPod, which now enables us to create interactive musical content for this device.},
}

@inproceedings{MTGJane2005:343,
	Author = {Janer, J. and Loscos, A.},
	Title = {Morphing techniques for enhanced scat singing},
	Booktitle = {Proceedings of 8th Intl. Conference on Digital Audio Effects},
	Address = {Madrid, Spain},
	Year = {2005},
	Url = {http://www.iua.upf.edu/mtg/publications/622612-DAFX05-aloscos.pdf},
	Keywords = {voice-to-midi},
	Abstract = {In jazz, scat singing is a phonetic improvisation that imitates instrumental sounds. In this paper, we propose a system that aims to transform singing voice into real instrument sounds, extending the possibilities for scat singers. Analysis algorithms in the spectral domain extract voice parameters, which drive the resulting instrument sound. A small database contains real instrument samples that have been spectrally analyzed offline. Two different prototypes are introduced, reproducing a trumpet and a bass guitar respectively.},
}

@inproceedings{Janer:2005a,
	Author = {Janer, J.},
	Title = {Feature Extraction for Voice-driven Synthesis},
	Booktitle = {Proceedings of 118th Audio Engineering Society Convention},
	Address = {Barcelona},
	Year = {2005},
	Url = {http://www.iua.upf.edu/mtg/publications/9cf4b9-AES2005-jjaner.pdf},
	Abstract = {This paper explores the singing voice from an unusual perspective, not as a musical instrument but as a musical controller. A set of spectral processing algorithms extract features from the input voice. These features are categorized in four groups: excitation, vocal tract, voice quality, and context. The extracted values are then transmitted as Open Sound Control (OSC) messages to be used in an external synthesis engine. In this paper, we provide first a technical description of the algorithms, and in a second part, we detail the components of the system. A practical example of voice-driven synthesis using PureData (Pd) is also presented.},
}

@inproceedings{MTGMaes2005:319,
	Author = {Maestre, E. and G{\'o}mez, E.},
	Title = {Automatic characterization of dynamics and articulation of expressive monophonic recordings},
	Booktitle = {Proceedings of 118th Audio Engineering Society Convention},
	Address = {Barcelona},
	Year = {2005},
	Url = {http://www.iua.upf.edu/mtg/publications/07fa41-AES118-MaestreGomez.pdf},
	Abstract = {We describe a method to automatically extract a set of features from the audio signal that are related to musical expressivity, more concretely to dynamics and articulation. We define a description scheme based on intra-note segmentation into attack, sustain, release and transition segments, and a subsequent amplitude and pitch contour characterization. Then, we present a series of algorithms to automatically perform intra-note segmentation and extract some features related to expressivity. We evaluate the performance of the methods for intra-note segmentation and feature extraction over a saxophone database of jazz standards and other recordings presenting expressive resources. Finally, we propose some future work and applications.},
}

@inproceedings{Loscos:2005,
	Author = {Loscos, A. and Aussenac, T.},
	Title = {The {W}ahwactor: a voice controlled wah-wah pedal},
	Booktitle = {Proceedings of {N}ew {I}nterfaces for {M}usical {E}xpression ({NIME})},
	Address = {Vancouver, Canada},
	Year = {2005},
	Url = {http://www.iua.upf.edu/mtg/publications/26e7c9-NIME05-aloscos.pdf},
	Abstract = {Using a wah-wah pedal guitar is something guitar players have to learn. Recently, more intuitive ways to control such effect have been proposed. In this direction, the Wahwactor system controls a wah-wah transformation in real-time using the guitar player's voice, more precisely, using the performer [wa-wa] utterances. To come up with this system, different vocal features derived from spectral analysis have been studied as candidates for being used as control parameters. This paper details the results of the study and presents the implementation of the whole system.},
}

@inproceedings{MTGLosc2006:407,
	Address = {San Francisco, CA, USA},
	Author = {Loscos, A. and Bonada, J.},
	Booktitle = {Proceedings of 121st Convention of the Audio Engineering Society},
	Title = {Esophageal Voice Enhancement by Modeling Radiated Pulses in Frequency Domain},
	Url = {http://www.iua.upf.edu/mtg/publications/9d0455-AES121-aloscos-jonada.pdf},
	Year = {2006},
	Abstract = {Although esophageal speech has demonstrated to be the most popular voice recovering method after laryngectomy surgery, it is difficult to master and shows a poor degree of intelligibility. This article proposes a new method for esophageal voice enhancement using speech digital signal processing techniques based on modeling radiated voice pulses in frequency domain. The analysis-transformation-synthesis technique creates a non-pathological spectrum for those utterances featured as voiced and filters those unvoiced. Healthy spectrum generation implies transforming the original timbre, modeling harmonic phase coupling from the spectral shape envelope, and deriving pitch from frame energy analysis. Resynthesized speech aims to improve intelligibility, minimize artificial artifacts, and acquire resemblance to patient's pre-surgery original voice.},
}

@inproceedings{Maestre:2006,
	Author = {Maestre, E. and Bonada, J. and Mayor, O.},
	Title = {Modeling musical articulation gestures in singing voice performances},
	Booktitle = {Proceedings of 121st Convention of the Audio Engineering Society},
	Address = {San Francisco, CA, USA},
	Year = {2006},
	Url = {http://www.iua.upf.edu/mtg/publications/53f6ad-AES121-MaestreBonadaMayor.pdf},
	Abstract = {We present a procedure to automatically describe musical articulation gestures used in singing voice performances. We detail a method to characterize temporal evolution of fundamental frequency and energy contours by a set of piece-wise fitting techniques. Based on this, we propose a meaningful parameterization that allows reconstructing contours from a compact set of parameters at different levels. We test the characterization method by applying it to fundamental frequency contours of manually segmented transitions between adjacent notes, and train several classifiers with manually labeled examples. We show the recognition accuracy for different parameterizations and levels of representation.},
}

@incollection{MTGHerr2006:377,
	Author = {Herrera, P. and Klapuri, A. and Davy, M.},
	Booktitle = {Signal Processing Methods for Music Transcription},
	Editor = {Klapuri, A. and Davy, M.},
	Publisher = {Springer},
	Title = {Automatic Classification of Pitched Musical Instrument Sounds},
	Year = {2006},
	Abstract = {In this chapter we first discuss the methodological elements required for developing systems for the automatic classification of sounds. We then introduce a number of acoustic features that are useful for describing the sounds of pitched musical instruments, some techniques to further distil the information they provide (i.e., transformation and projection), and some strategies for selecting the best features when large sets of them are available. Several techniques for automatic classification, complementing those explained in other chapters of the book, are presented in another specific section. Once the reader has been introduced to all the tools this problem calls for, we present a review of the different systems and approaches that have been proposed for the automatic classification of instruments in isolated sounds, solo phrases, duets, and polyphonic audio. Conclusions and topics for future research compose the closing section of the chapter.},
}

@inproceedings{Janer:2005,
	Address = {Singapore},
	Author = {Janer, J.},
	Booktitle = {Proceedings of {N}ew {I}nterfaces for {M}usical {E}xpression ({NIME})},
	Pages = {132--135},
	Publisher = {National University of Singapore},
	Title = {Voice-controlled plucked bass guitar through two synthesis techniques},
	Url = {http://hct.ece.ubc.ca/nime/2005/proc/nime2005_132.pdf},
	Year = {2005},
	Abstract = {In this paper we present an example of the use of the singing voice as a controller for digital music synthesis. The analysis of the voice with spectral processing techniques, derived from the Short-Time Fourier Transform, provides ways of determining a performer's vocal intentions. We demonstrate a prototype, in which the extracted vocal features drive the synthesis of a plucked bass guitar. The sound synthesis stage
includes two different synthesis techniques, Physical Models and Spectral Morph.
},
}

@inproceedings{mccartney1996snr,
	Author = {McCartney, J.},
	Booktitle = {Proceedings of the International Computer Music Conference (ICMC'96)},
	Pages = {257--258},
	Title = {{SuperCollider}: A New Real Time Synthesis Language},
	Year = {1996},
}

@incollection{Fujimura:1999,
	Author = {Fujimura, O. and Erickson, D.},
	Booktitle = {The Handbook of Phonetic Sciences},
	Editor = {Hardcastle, W. J. and Laver, J.},
	Keywords = {phonetics, trill},
	Pages = {65--115},
	Publisher = {Blackwell},
	Series = {Blackwell Handbooks in Linguistics},
	Title = {Acoustic Phonetics},
	Year = {1999},
}

@book{Ladefoged:1997,
	Author = {Ladefoged, P. and Maddieson, I.},
	Title = {The Sounds of the World's Languages},
	Publisher = {Blackwell},
	Year = {1997},
	Keywords = {phonetics, trill},
}

@book{Clark:1997,
	Author = {Clark, J. and Yallop, C.},
	Title = {An Introduction to Phonetics and Phonology},
	Edition = {2nd},
	Series = {Blackwell Textbooks in Linguistics},
	Publisher = {Blackwell},
	Year = {1997},
	Keywords = {phonetics, trill},
}

@book{Fry:1996,
	Author = {Fry, D. B.},
	Title = {The Physics of Speech},
	Series = {Cambridge Textbooks in Linguistics},
	Publisher = {Cambridge University Press},
	Year = {1996},
	Keywords = {phonetics},
}

@article{Blaschke:2007,
	Author = {Blaschke, T. and Zito, T. and Wiskott, L.},
	Journal = {Neural Computation},
	Keywords = {independent component analysis, slow feature analysis},
	Title = {Independent Slow Feature Analysis and Nonlinear Blind Source Separation},
	Url = {http://itb1.biologie.hu-berlin.de/~wiskott/Publications/BlasZitoWisk2007-ISFA-NeurComp.pdf},
	Year = {2007},
	Abstract = {In the linear case statistical independence is a sufficient criterion for performing blind source separation. In the nonlinear case, however, it leaves an ambiguity in the solutions that has to be resolved by additional criteria. Here we argue that temporal slowness complements statistical independence well and that a combination of the two leads to unique solutions of the nonlinear blind source separation problem. The algorithm we present is a combination of second-order Independent Component Analysis and Slow Feature Analysis and is referred to as Independent Slow Feature Analysis. Its performance is demonstrated on nonlinearly mixed music data. We conclude that slowness is indeed a useful complement to statistical independence but that time-delayed second-order moments are only a weak measure of statistical independence.},
}

@article{Aucouturier:2006a,
	Author = {Aucouturier, J.-J. and Pachet, F.},
	Journal = {Journal of the Acoustical Society of America},
	Keywords = {Soundscapes, Music, Similarity, Classification, Homogeneity, Distribution},
	Title = {The Bag-of-frame Approach to Audio Pattern Recognition: Why This Works for Urban Soundscapes and Not For Polyphonic Music},
	Url = {http://jj-aucouturier.info/papers/JASA-2006.pdf},
	Year = {2006},
	Abstract = {The ``bag of frame'' approach (BOF) to audio pattern recognition represents signals as the long-term statistical distribution of their local spectral features, a prototypical implementation of which being Gaussian Mixture Models of Mel-Frequency Cepstrum Coefficients. This approach has proved nearly optimal for modelling the perception of environmental audio textures, and is also the most predominant paradigm to extract high-level descriptions from music signals, such as their instrument, genre or mood. However, recent studies show that, contrary to its application to environmental signals, BOF only provides limited performance when applied to polyphonic music signals. This paper proposes to explicitly examine the difference between soundscape and polyphonic music signals with respect to their modelling with the BOF approach. We describe an algorithmic measure of acoustic similarity between sound textures, which is a typical instantiation of the BOF approach, and compare its results on urban soundscapes and polyphonic music. We report near perfect results for soundscapes, and notably observe none of the limitations occurring with the music dataset. Moreover, we study the effect on the measure of 2 custom homogeneity transforms. We observe critical differences in the temporal and statistical structure of the typical frame distribution for each type of signal, and propose that they explain the uneven performance of their respective modelling with the BOF approach.},
}

@inproceedings{Aucouturier:2002,
	Author = {Aucouturier, J.-J. and Sandler, M.},
	Booktitle = {Proc AES 22nd Int Conf Virtual, Synthetic and Entertainment Audio},
	Title = {Finding repeating patterns in acoustical musical signals},
	Url = {http://www.jj-aucouturier.info/papers/AES-2002.pdf},
	Year = {2002},
	Abstract = {Finding structure and repetitions in a musical signal is crucial to enable interactive browsing into large databases of music files. Notably, it is useful to produce short summaries of musical pieces, or ``audio thumbnails''. In this paper, we propose an algorithm to find repeating patterns in an acoustic musical signal. We first segment the signal into a meaningful succession of timbres. This gives a reduced string representation of the music, the texture score, which doesn't encode any pitch information. We then look for patterns in this representation, using two techniques from image processing: Kernel Convolution and Hough Transform. The resulting patterns are relevant to musical structure, which shows that pitch is not the only useful representation for the structural analysis of polyphonic music.},
}

@phdthesis{Collins:2006,
	Author = {Collins, N.},
	School = {University of Cambridge},
	Title = {Towards Autonomous Agents for Live Computer Music: Realtime Machine Listening and Interactive Music Systems},
	Year = {2006},
	Abstract = {Musical agents which can interact with human musicians in concert situations are a reality, though the extent to which they themselves embody human-like capabilities can be called into question. They are perhaps most correctly viewed, given their level of artificial intelligence technology, as `projected intelligences', a composer's anticipation of the dynamics of a concert setting made manifest in programming code. This thesis will describe a set of interactive systems developed for a range of musical styles and instruments, all of which attempt to participate in a concert by means of audio signal analysis alone. Machine listening, being the simulation of human peripheral auditory abilities, and the hypothetical modelling of central auditory and cognitive processes, is utilised in these systems to track musical activity. Whereas much of this modelling is inspired by a bid to emulate human abilities, strategies diverging from plausible human physiological mechanisms are often employed, leading to machine capabilities which exceed or differ from the human counterparts. Technology is described which detects events from an audio stream, further analysing the discovered events (typically notes) for perceptual features of loudness, pitch, attack time and timbre. In order to exploit processes that underlie common musical practice, beat tracking is investigated, allowing the inference of metrical structure which can act as a co-ordinative framework for interaction. Psychological experiments into human judgement of perceptual attack time and beat tracking to ecologically valid stimuli clarify the parameters and constructs that should most appropriately be instantiated in the computational systems. All the technology produced is intended for the demanding environment of realtime concert use.
In particular, an algorithmic audio splicing and analysis library called BBCut2 is described, designed with appropriate processing and scheduling faculties for realtime operation. Proceeding to outlines of compositional applications, novel interactive music systems are introduced which have been tested in real concerts. These are evaluated by interviews with the musicians who performed with them, and an assessment of their claims to agency in the sense of `autonomous agents'. The thesis closes by considering all that has been built, and the possibilities for future advances allied to artificial intelligence and signal processing technology.},
}

@inproceedings{kapanci2004hao,
	Author = {Kapanci, E. and Pfeffer, A.},
	Title = {A hierarchical approach to onset detection},
	Booktitle = {Proceedings of the International Computer Music Conference (ICMC'04)},
	Pages = {438--441},
	Year = {2004},
	Url = {http://www.eecs.harvard.edu/~avi/Papers/onset13.pdf},
	Abstract = {Onset detection in vocal music and many other instruments is complicated by the possibility of soft transitions between notes. Most systems try to identify onsets within a short-time window as it is easier to define transition functions over a restricted space. However, it may not be possible to detect soft onsets without considering a long-time window, for which defining and computing the transition function can be hard and computationally costly. We present a method which looks for onsets between locations of increasing distance and is able to capture such onsets without considering all the points within the window. For the onset identification function we use both a simple manual function and support vector machines trained using a labelled corpus. },
}

@article{Aucouturier:2004,
	Author = {Aucouturier, J.-J. and Pachet, F.},
	Journal = {Journal of Negative Results in Speech and Audio Sciences},
	Keywords = {negative results, MFCC},
	Number = {1},
	Title = {Improving Timbre Similarity: How high is the sky?},
	Url = {http://www.csl.sony.fr/downloads/papers/uploads/aucouturier-04b.pdf},
	Volume = {1},
	Year = {2004},
	Abstract = {We report on experiments done in an attempt to improve the performance of a music similarity measure which we introduced in [2]. The technique aims at comparing music titles on the basis of their global ``timbre'', which has many applications in the field of Music Information Retrieval. Such measures of timbre similarity have seen a growing interest lately, and every contribution (including ours) is yet another instantiation of the same basic pattern recognition architecture, only with different algorithm variants and parameters. Most give encouraging results with a little effort, and imply that near-perfect results would just extrapolate by fine-tuning the algorithms' parameters. However, such systematic testing over large, inter-dependent parameter spaces is both difficult and costly, as it requires to work on a whole general meta-database architecture. This paper contributes in two ways to the current state of the art. We report on extensive tests over very many parameters and algorithmic variants, either already envisioned in the literature or not. This leads to an improvement over existing algorithms of about 15\% R-precision. But most importantly, we describe many variants that surprisingly do not lead to any substantial improvement. Moreover, our simulations suggest the existence of a ``glass ceiling'' at R-precision about 65\% which cannot probably be overcome by pursuing such variations on the same theme.},
}

@inproceedings{Pachet:2003,
	Author = {Pachet, F. and Zils, A.},
	Title = {Evolving automatically high-level music descriptors from acoustic signals},
	Booktitle = {1st International Symposium on Computer Music Modeling and Retrieval (CMMR2003)},
	Publisher = {Springer Verlag},
	Year = {2003},
	Keywords = {genetic algorithms, genetic programming},
	Url = {http://www.csl.sony.fr/downloads/papers/uploads/pachet-03h.pdf},
	Abstract = {High-Level music descriptors are key ingredients for music information retrieval systems. Although there is a long tradition in extracting information from acoustic signals, the field of music information extraction is largely heuristic in nature. We present here a heuristic-based generic approach for extracting automatically high-level music descriptors from acoustic signals. This approach is based on Genetic Programming, that is used to build extraction functions as compositions of basic mathematical and signal processing operators. The search is guided by specialized heuristics that embody knowledge about the signal processing functions built by the system. Signal processing patterns are used in order to control the general function extraction methods. Rewriting rules are introduced to simplify overly complex expressions. In addition, a caching system further reduces the computing cost of each cycle. In this paper, we describe the overall system and compare its results against traditional approaches in musical feature extraction {\`a} la Mpeg7. 
},
}

@inproceedings{Pachet:2004,
	Author = {Pachet, F. and Zils, A.},
	Booktitle = {Computer Music Modeling and Retrieval},
	Isi = {ISI:000189495400005},
	Keywords = {genetic algorithms, genetic programming},
	Pages = {42--53},
	Publication-Type = {S},
	Series = {Lecture Notes in Computer Science},
	Title = {Evolving automatically high-level music descriptors from acoustic signals},
	Volume = {2771},
	Year = {2004},
	Abstract = {High-Level music descriptors are key ingredients for music information retrieval systems. Although there is a long tradition in extracting information from acoustic signals, the field of music information extraction is largely heuristic in nature. We present here a heuristic-based generic approach for extracting automatically high-level music descriptors from acoustic signals. This approach is based on Genetic Programming, that is used to build extraction functions as compositions of basic mathematical and signal processing operators. The search is guided by specialized heuristics that embody knowledge about the signal processing functions built by the system. Signal processing patterns are used in order to control the general function extraction methods. Rewriting rules are introduced to simplify overly complex expressions. In addition, a caching system further reduces the computing cost of each cycle. In this paper, we describe the overall system and compare its results against traditional approaches in musical feature extraction {\`a} la Mpeg7.},
}

@article{Aucouturier:2005,
	Author = {Aucouturier, J.-J. and Pachet, F. and Sandler, M.},
	Isi = {ISI:000233471200004},
	Journal = {IEEE Transactions on Multimedia},
	Month = dec,
	Number = {6},
	Pages = {1028--1035},
	Title = {{``The way it sounds''}: Timbre models for analysis and retrieval of music signals},
	Url = {http://ieeexplore.ieee.org/iel5/6046/32935/01542080.pdf?tp=&arnumber=1542080&isnumber=32935},
	Volume = {7},
	Year = {2005},
	Abstract = {Electronic Music Distribution is in need of robust and automatically extracted music descriptors. An important attribute of a piece of polyphonic music is what is commonly referred to as 'the way it sounds'. While there has been a large quantity of research done to model the timbre of individual instruments, little work has been done to analyze 'real world' timbre mixtures such as the ones found in popular music. In this paper, we present our research about such 'polyphonic timbres'. We describe an effective way to model the textures found in a given music signal, and show that such timbre models provide new solutions to many issues traditionally encountered in music signal processing and music information retrieval. Notably, we describe their applications for music similarity, segmentation and pattern induction.},
}

@article{Aucouturier:2006,
	Author = {Aucouturier, J.-J. and Pachet, F.},
	Isi = {ISI:000238193200004},
	Journal = {Journal of New Music Research},
	Month = mar,
	Number = {1},
	Pages = {35--50},
	Title = {Jamming with plunderphonics: Interactive concatenative synthesis of music},
	Url = {http://www.csl.sony.fr/downloads/papers/2005/aucouturier-05c.pdf},
	Volume = {35},
	Year = {2006},
	Abstract = {This paper proposes to use the techniques of Concatenative Sound Synthesis in the context of real-time Music Interaction. We describe a system that generates an audio track by concatenating audio segments extracted from pre-existing musical files. The track can be controlled in real-time by specifying high-level properties (or constraints) holding on metadata about the audio segments. A constraint-satisfaction mechanism, based on local search, selects audio segments that best match those constraints at any time. We describe the real-time aspects of the system, notably the asynchronous adding/removing of constraints, and report on several constraints and controllers designed for the system. We illustrate the system with several application examples, notably a virtual drummer able to interact with a human musician in real-time.},
}

@article{Paraskevas:2004,
	Author = {Paraskevas, I. and Chilton, E.},
	Doi = {10.1121/1.1755731},
	Journal = {Acoustics Research Letters Online},
	Month = jul,
	Number = {3},
	Pages = {111--117},
	Title = {Combination of magnitude and phase statistical features for audio classification},
	Url = {http://scitation.aip.org/getpdf/servlet/GetPDFServlet?filetype=pdf&id=ARLOFJ000005000003000111000001&idtype=cvips&prog=normal},
	Volume = {5},
	Year = {2004},
	Abstract = {The increasing demand for the retrieval and classification of audio utterances from multimedia databases, gives rise to the need for the implementation of effective feature extraction techniques. Most recent techniques employ temporal-related features and magnitude spectral features. In the proposed method, we use both the magnitude and phase spectrum of the signals to derive the features. By overcoming the discontinuity problems of phase, phase may be used as an additional feature stream. The experimental results derived from ten classes of gunshots show that, for certain classes, there is an improvement of 14\% when both magnitude and phase information is employed, compared to the case when only the magnitude feature vector is used. Also, the results reported here show that the reliability of the method is increased, demonstrating the complementary nature of magnitude and phase. {\copyright}2004 Acoustical Society of America.},
}

@article{cemgil00jnmr,
	Author = {Cemgil, A. T. and Kappen, B. and Desain, P. and Honing, H.},
	Journal = {Journal of New Music Research},
	Keywords = {tempo tracking},
	Number = {4},
	Pages = {259--273},
	Title = {On tempo tracking: Tempogram Representation and {K}alman Filtering},
	Volume = {29},
	Year = {2000},
	Abstract = {We formulate tempo tracking in a Bayesian framework where a tempo tracker is modeled as a stochastic dynamical system. The tempo is modeled as a hidden state variable of the system and is estimated by a Kalman filter. The Kalman filter operates on a Tempogram, a wavelet-like multiscale expansion of a real performance. An important advantage of our approach is that it is possible to formulate both off-line or real-time algorithms. The simulation results on a systematically collected set of MIDI piano performances of Yesterday and Michelle by the Beatles shows accurate tracking of approximately 90\% of the beats.},
}

@inproceedings{dannenberg90icmc,
	Address = {Hong Kong},
	Author = {Allen, P. E. and Dannenberg, R. B.},
	Booktitle = {Proceedings of the International Computer Music Conference (ICMC'90)},
	Pages = {140--143},
	Title = {Tracking Musical Beats in Real Time},
	Year = {1990},
	Abstract = {Identifying the temporal location of downbeats is a fundamental musical skill. After briefly discussing the perceptual information available in beat tracking, we survey previous attempts to automate this process in both real time and non-real time. Our attempt to add flexibility to Mont-Reynaud's model [4] by parameterizing the confidence and history mechanisms failed to yield a satisfactory beat tracker. Observing that this and previous models are constrained to hold a single current notion of beat timing and placement, we find that they will fail to predict beats and not recover beyond the point at which a mistake is first made. We propose a new model that uses beam search [1] to simultaneously consider multiple interpretations of the performance. At any time, predictions of beat timing and placement can be made according to the most credible of many interpretations under consideration. Credibility is determined by a heuristic evaluation function.},
}

@inproceedings{leveau04ismir,
	Author = {Leveau, P. and Daudet, L. and Richard, G.},
	Title = {Methodology and tools for the evaluation of automatic onset detection algorithms in music},
	Booktitle = {Proceedings of 5th International Symposium on Music Information Retrieval},
	Address = {Barcelona, Spain},
	Pages = {72--75},
	Year = {2004},
}

@book{bentley:1999,
	Editor = {Bentley, P. J.},
	Isbn = {978-1558606050},
	Keywords = {genetic algorithms, genetic programming, evolutionary computation},
	Month = may,
	Publisher = {Harcourt International},
	Title = {Evolutionary Design by Computers},
	Year = {1999},
}

@inproceedings{ramirez:evows05,
	Address = {Lausanne, Switzerland},
	Author = {Ramirez, R. and Hazan, A.},
	Booktitle = {Applications of Evolutionary Computing, EvoWorkshops2005: {EvoBIO}, {EvoCOMNET}, {EvoHOT}, {EvoIASP}, {EvoMUSART}, {EvoSTOC}},
	Editor = {Rothlauf, Franz and Branke, Juergen and Cagnoni, Stefano and Corne, David W. and Drechsler, Rolf and Jin, Yaochu and Machado, Penousal and Marchiori, Elena and Romero, Juan and Smith, George D. and Squillero, Giovanni},
	Isbn = {3-540-25396-3},
	Issn = {0302-9743},
	Keywords = {evolutionary computation},
	Month = {30 March--1 April},
	Pages = {508--516},
	Publisher = {Springer Verlag},
	Publisher_Address = {Berlin},
	Series = {LNCS},
	Title = {Understanding Expressive Music Performance Using Genetic Algorithms},
	Volume = {3449},
	Year = {2005},
	Abstract = {We describe an approach to learning expressive performance rules from monophonic Jazz standards recordings by a skilled saxophonist. We use a melodic transcription system which extracts a set of acoustic features from the recordings producing a melodic representation of the expressive performance played by the musician. We apply genetic algorithms to this representation in order to induce rules of expressive music performance. The rules collected during different runs of our system are of musical interest and have a good prediction accuracy.},
}

@inproceedings{Hazan:2005c,
	Author = {Hazan, A.},
	Booktitle = {Proceedings of the 118th Audio Engineering Society Convention (AES 118)},
	Keywords = {beatboxing, beatbox},
	Month = may,
	Title = {{BillaBoop}: Real-Time Voice-Driven Drum Generator},
	Year = {2005},
	Abstract = {A real-time application for generating drum rhythms controlled by voice is presented. By expressive drum rhythms we refer to a sequence of drum sounds that can be a subset of a samples bank or that can be generated by different drum synthesizers. The system consists of: (a) a descriptors generation component that computes a set of temporal and spectral features from each incoming frame, (b) a multi-band onset detection component based on spectral variations of the incoming stream, (c) a machine learning component which assigns to each of the vocal hits of the input stream a label. Both supervised and unsupervised approaches are considered for the learning task. The last component is (d) a beat generator that generates an output rhythmic stream taking into account continuous expressive features of the vocal performance. This work can be seen as a preliminary step in order to build a robust interface able to process a wide range of real-world signals. Indeed, several vocal onomatopoeias can correspond to the same drum label, depending on the playing style of a given performer. Thus we considered a wide range of oral percussive signals from different performers in the perspective of building a model of immediate use, without prior learning step. All these components are integrated into a low-latency application that allows its use for live performances.},
}

@inproceedings{Hazan:2005b,
	Author = {Ramirez, R. and Hazan, A.},
	Booktitle = {Proceedings of the 118th Audio Engineering Society Convention (AES 118)},
	Local-Url = {file:///Volumes/CDM00/Documents/AES-CONV-118/Files/AES118-000298.pdf},
	Month = may,
	Title = {An Approach to Expressive Music Performance Modeling},
	Year = {2005},
	Abstract = {We describe an approach for generating expressive music performances of monophonic Jazz melodies. The system consists of a melodic transcription component which extracts a set of acoustic features from monophonic recordings, a machine learning component which induces an expressive transformation model from the extracted acoustic features, and a melody synthesis component which generates expressive monophonic output from inexpressive melody descriptions using the induced expressive transformation model.},
}

@article{childers1977cgp,
	Author = {Childers, D. G. and Skinner, D. P. and Kemerait, R. C.},
	Journal = {Proceedings of the IEEE},
	Number = {10},
	Pages = {1428--1443},
	Title = {The cepstrum: A guide to processing},
	Url = {http://ieeexplore.ieee.org/iel5/5/31258/01455016.pdf?isnumber=&arnumber=1455016},
	Volume = {65},
	Year = {1977},
	Abstract = {This paper is a pragmatic tutorial review of the cepstrum literature focusing on data processing. The power, complex, and phase cepstra are shown to be easily related to one another. Problems associated with phase unwrapping, linear phase components, spectrum notching, aliasing, oversampling, and extending the data sequence with zeros are discussed. The advantages and disadvantages of windowing the sampled data sequence, the log spectrum, and the complex cepstrum are presented. The influence of noise upon the data processing procedures is discussed throughout the paper, but is not thoroughly analyzed. The effects of various forms of liftering the cepstrum are described. The results obtained by applying whitening and trend removal techniques to the spectrum prior to the calculation of the cepstrum are discussed. We have attempted to synthesize the results, procedures, and information peculiar to the many fields that are finding cepstrum analysis useful. In particular we discuss the interpretation and processing of data in such areas as speech, seismology, and hydroacoustics. But we must caution the reader that the paper is heavily influenced by our own experiences; specific procedures that have been found useful in one field should not be considered as totally general to other fields. It is hoped that this review will be of value to those familiar with the field and reduce the time required for those wishing to become so.},
}

@inproceedings{degroeve2005sao,
	Author = {Degroeve, S. and Tanghe, K. and De Baets, B. and Leman, M. and Martens, J.-P.},
	Booktitle = {Proceedings of ISMIR},
	Keywords = {drum classification, Mel Frequency Cepstral Coefficients, Support Vector Machine, Simulated Annealing},
	Title = {A Simulated Annealing Optimization of Audio Features for Drum Classification},
	Url = {http://www.ipem.ugent.be/MAMI/Public/Papers/ISMIR2005_DrumDetectionOptimization_Degroeve.pdf},
	Year = {2005},
	Abstract = {Current methods for the accurate recognition of instruments within music are based on discriminative data descriptors. These are features of the music fragment that capture the characteristics of the audio and suppress details that are redundant for the problem at hand. The extraction of such features from an audio signal requires the user to set certain parameters. We propose a method for optimizing the parameters for a particular task on the basis of the Simulated Annealing algorithm and Support Vector Machine classification. We show that using an optimized set of audio features improves the recognition accuracy of drum sounds in music fragments.},
}

@inproceedings{LeSaffre:2003,
	Address = {Hannover, Germany},
	Author = {LeSaffre, M. and Moelants, D. and Leman, M. and De Baets, B. and De Meyer, H. and Martens, G. and Martens, J.-P.},
	Booktitle = {Proceedings of the 5th Triennial ESCOM Conference},
	Pages = {208--211},
	Title = {User behavior in the spontaneous reproduction of musical pieces by vocal query},
	Url = {http://www.epos.uos.de/music/books/k/klww003/pdfs/176_Lesaffre_Proc.pdf},
	Year = {2003},
	Abstract = {Background:

This experiment is part of a broader research project in the field of Musical Information Retrieval. In order to realize a user-friendly system for searching musical pieces by vocal query, the behavior of subjects asked to imitate well-known songs from long-term memory and unfamiliar songs after a single hearing was investigated.

Aims: Our aim is to analyze the characteristics of the behavior of people who reproduce a piece of music from memory in an intuitive way. This should lead to a view of preferences for certain methods of vocal query.

Method: 72 subjects participated in an experiment in which they were asked to reproduce pieces of music in front of a microphone. No further restrictions were given. In the first part of the experiment subjects responded to titles of pieces they previously indicated as familiar. In the second part entire pieces of music, indicated as unfamiliar, were aurally presented before asking reproduction.

Results: In general, participants asked to reproduce music prefer a melodic use of the text or of specific syllables. Significant effects of gender and musical background were found as well as differences between the reproduction of unfamiliar melodies and the recall of known melodies. Clear relations between user behavior and musical content were found.

Conclusions: User preferences and general characteristics of vocal queries aimed at searching specific pieces in a music database are established. The findings generate some guidelines for the development of user-friendly systems for musical information retrieval based on vocal queries.
},
}

@article{Pollard:1982,
	Author = {Pollard, H. F. and Jansson, E. V.},
	Title = {A Tristimulus Method for the Specification of Musical Timbre},
	Journal = {Acustica},
	Volume = {51},
	Pages = {162--171},
	Year = {1982},
}

@phdthesis{Jensen:1999,
	Author = {Jensen, K.},
	Title = {Timbre Models of Musical Sounds},
	School = {University of Copenhagen, Denmark},
	Year = {1999},
	Url = {http://www.aaue.dk/~krist/TMoMS.pdf},
	Abstract = {This work involves the analysis of musical instrument sounds, the creation of timbre models, the estimation of the parameters of the timbre models and the analysis of the timbre model parameters.

The timbre models are found by studying the literature of auditory perception, and by studying the gestures of music performance.

Some of the important results from this work are an improved fundamental frequency estimator, a new envelope analysis method, and simple intuitive models for the sound of musical instruments. Furthermore a model for the spectral envelope is introduced in this work. A new function, the brightness creation function, is introduced in the spectral envelope model.

The timbre model is used to analyze the evolution of the different timbre parameters when the fundamental frequency is changed, but also for different intensity, tempo, or style. The main results from this analysis are that brightness rises with frequency, but nevertheless the fundamental has almost all amplitude for the high notes. The attack and release times generally fall with frequency. It was found that only brightness and amplitude are affected by a change in intensity, and only the sustain and release times are affected when the tempo is changed.

The different timbre models are also used for the classification of the sounds in musical instrument classes with very good results. Finally, listening tests have been performed, which assessed that the best timbre model has an acceptable sound quality.},
}

@article{1062166,
	Address = {Amsterdam, The Netherlands},
	Author = {Hermus, Kris and Verhelst, Werner and Lemmerling, Philippe and Wambacq, Patrick and Van Huffel, Sabine},
	Doi = {10.1016/j.sigpro.2004.09.010},
	Issn = {0165-1684},
	Journal = {Signal Processing},
	Number = {1},
	Pages = {163--176},
	Publisher = {Elsevier North-Holland, Inc.},
	Title = {Perceptual audio modeling with exponentially damped sinusoids},
	Volume = {85},
	Year = {2005},
	Abstract = {This paper presents the derivation of a new perceptual model that represents speech and audio signals by a sum of exponentially damped sinusoids. Compared to a traditional sinusoidal model, the exponential sinusoidal model (ESM) is better suited to model transient segments that are readily found in audio signals. Total least squares (TLS) algorithms are applied for the automatic extraction of the modeling parameters in the ESM, i.e. the amplitude, phase, frequency and damping factors of a user-defined number of damped sinusoids. In order to turn the SNR optimization criterion of these TLS algorithms into a perceptual modeling strategy, we use the psychoacoustic model of MPEG-1 Layer 1 in a subband TLS-ESM scheme. This allows us to model each subband signal in accordance with its perceptual relevance, thereby lowering the number of required modeling components for a given modeling quality. Simulations and listening tests confirm that perceptual ESM achieves the same perceived quality as plain ESM while using substantially less components, and provide support for applying the new model in the fields of parametric audio processing and coding.},
}

@article{Kohonen:1990,
	Author = {Kohonen, T.},
	Issn = {0018-9219},
	Ja = {Proceedings of the IEEE},
	Journal = {Proceedings of the IEEE},
	Keywords = {learning systems, neural nets, self-adjusting systems, speech recognition, clustering, competitive learning, learning vector, neural networks, self-organizing map, semantic maps},
	Number = {9},
	Pages = {1464--1480},
	Title = {The self-organizing map},
	Ty = {JOUR},
	Url = {http://ieeexplore.ieee.org/iel1/5/2115/00058325.pdf?isnumber=2115&arnumber=58325},
	Volume = {78},
	Year = {1990},
	Abstract = {The self-organized map, an architecture suggested for artificial neural networks, is explained by presenting simulation experiments and practical applications. The self-organizing map has the property of effectively creating spatially organized internal representations of various features of input signals and their abstractions. One result of this is that the self-organization process can discover semantic relationships in sentences. Brain maps, semantic maps, and early work on competitive learning are reviewed. The self-organizing map algorithm (an algorithm which order responses spatially) is reviewed, focusing on best matching cell selection and adaptation of the weight vectors. Suggestions for applying the self-organizing map algorithm, demonstrations of the ordering process, and an example of hierarchical clustering of data are presented. Fine tuning the map by learning vector quantization is addressed. The use of self-organized maps in practical speech recognition and a simulation experiment on semantic mapping are discussed},
}

%% Conference paper (SPECOM 2005) comparing MFCC implementations for speaker verification.
%% NOTE(review): Volume = {1} presumably denotes the proceedings volume -- confirm against the printed proceedings.
@inproceedings{Ganchev:2005,
	Author = {Ganchev, T. and Fakotakis, N. and Kokkinakis, G.},
	Booktitle = {10th International Conference on Speech and Computer, SPECOM 2005},
	Keywords = {MFCC},
	Pages = {191--194},
	Title = {Comparative Evaluation of Various {MFCC} Implementations on the Speaker Verification Task},
	Url = {http://www.wcl.ee.upatras.gr/ai/papers/ganchev17.pdf},
	Volume = {1},
	Year = {2005},
	Abstract = {Making no claim of being exhaustive, a review of the most popular MFCC (Mel Frequency Cepstral Coefficients) implementations is made. These differ mainly in the particular approximation of the nonlinear pitch perception of human, the filter bank design, and the compression of the filter bank output. Then, a comparative evaluation of the presented implementations is performed on the task of text-independent speaker verification, by means of the well-known 2001 NIST SRE (speaker recognition evaluation) one-speaker detection database. },
}

@inproceedings{Lacoste:2005,
	Author = {Lacoste, A.},
	Booktitle = {MIREX 2005},
	Keywords = {Onset Detection, Neural Networks, Machine Learning},
	Title = {Onset Detection with Artificial Neural Networks for {MIREX} 2005},
	Url = {http://www.music-ir.org/evaluation/mirex-results/articles/onset/lacoste.pdf},
	Year = {2005},
	Abstract = {In this document, we describe two onset detection algorithms that have participated to the Mirex 2005 onset detection contest. The proposed algorithms, first classify frames of a spectrogram into onset or non-onset, using a feedforward neural network. From the classification of each frames, it extract the onset times, using a simple peak picking algorithm, based on a moving average.},
}

@inproceedings{Nilsson:2000,
	Author = {Nilsson, M. and Andersen, S. V. and Kleijn, W. B.},
	Booktitle = {Proceedings of the 2000 {IEEE} International Conference on Acoustics, Speech, and Signal Processing ({ICASSP} '00)},
	Ja = {Acoustics, Speech, and Signal Processing, 2000. ICASSP '00. Proceedings. 2000 IEEE International Conference on},
	Keywords = {information theory, spectral analysis, speech processing, frequency bands, high frequency band, low frequency bands, lower bound expression, mutual information, spectral envelope},
	Pages = {1327--1330},
	Title = {On the mutual information between frequency bands in speech},
	Ty = {CONF},
	Url = {http://ieeexplore.ieee.org/iel5/6939/18687/00861823.pdf?isnumber=&arnumber=861823},
	Volume = {3},
	Year = {2000},
	Abstract = {In this paper we investigate the mutual information in speech between the spectral envelope of the high frequency band and low frequency bands of various widths. Direct methods on the computation of the mutual information often result in an excessive amount of data required even for modest situations. We reduce the required amount of data by quantizing the low band leading to a lower bound expression on the mutual information. We indicate by simulation that this lower bound is in the same order of magnitude as the true mutual information. Simulations on speech show that we have no less than 0.1 bit of shared information between the slope of the high band and the low frequency band from 0-4 kHz. Performing the analogous simulation with the gain of the high band we obtained no less than 0.45 bit of mutual information},
}

@inproceedings{Hazan:2005,
	Address = {New York, NY, USA},
	Author = {Hazan, A.},
	Booktitle = {Proceedings of the 10th international conference on Intelligent User Interfaces (IUI '05)},
	Doi = {10.1145/1040830.1040904},
	Isbn = {1-58113-894-6},
	Pages = {296--298},
	Publisher = {ACM Press},
	Title = {Towards automatic transcription of expressive oral percussive performances},
	Url = {http://portal.acm.org/ft_gateway.cfm?id=1040904&type=pdf&coll=GUIDE&dl=GUIDE},
	Venue = {San Diego, California, USA},
	Year = {2005},
	Abstract = {We describe a tool for transcribing voice generated percussive rhythms. The system consists of: (a) a segmentation component which separates the monophonic input stream into percussive events (b) a descriptors generation component that computes a set of acoustic features from each of the extracted segments, (c) a machine learning component which assigns to each of the segmented sounds of the input stream a symbolic class. We describe each of these components and compare different machine learning strategies that can be used to obtain a symbolic representation of the oral percussive performance.},
}

@inproceedings{FitzGerald:2002,
	Author = {FitzGerald, D. and Coyle, E. and Lawlor, B.},
	Booktitle = {Proceedings of the 5th International Conference on Digital Audio Effects (DAFX'02)},
	Pages = {65--69},
	Title = {Sub-Band Independent Subspace Analysis for Drum Transcription},
	Url = {http://www2.hsu-hh.de/ant/dafx2002/papers/DAFX02_Fitzgerald_Coyle_Lawlor_drum_transcription.pdf},
	Year = {2002},
}

@inproceedings{Richard:2006,
	Author = {Gillet, O. and Richard, G.},
	Booktitle = {Proceedings of the 7th International Conference on Music Information Retrieval (ISMIR 2006)},
	Title = {{ENST-Drums}: an extensive audio-visual database for drum signals processing},
	Url = {http://www.enst.fr/~gillet/pdf/ISMIR2006.pdf},
	Year = {2006},
	Abstract = {One of the main bottlenecks in the progress of the Music Information Retrieval (MIR) research field is the limited access to common, large and annotated audio databases that could serve for technology development and/or evaluation. The aim of this paper is to present in detail the ENST-Drums database, emphasizing on both the content and the recording process. This audiovisual database of drum performances by three professional drummers was recorded on 8 audio channels and 2 video channels. The drum sequences are fully annotated and will be, for a large part, freely distributed for research purposes. The large variety in its content should serve research in various domains of audio signal processing involving drums, ranging from single drum event classification to complex multimodal drum track transcription and extraction from polyphonic music.},
}

@inproceedings{sillanpaa2000ran,
	Author = {Sillanp{\"a}{\"a}, J. and Klapuri, A. and Sepp{\"a}nen, J. and Virtanen, T.},
	Booktitle = {Proceedings of the European Signal Processing Conference, EUSIPCO-2000},
	Title = {Recognition of acoustic noise mixtures by combined bottom-up and top-down approach},
	Url = {http://www.cs.tut.fi/sgn/arg/music/drums2000.ps},
	Year = {2000},
	Abstract = {In this paper, a system is described for the recognition of mixtures of noise sources in acoustic input signals. The problem is approached by utilizing both bottom-up signal analysis and top-down predictions of higher-level models. The developments are made using musical signals as test material.},
}

@inproceedings{Bello:2006,
	Author = {Bello, J. P. and Ravelli, E. and Sandler, M.},
	Booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing, ICASSP'06},
	Keywords = {subbands},
	Pages = {233--236},
	Title = {Drum sound analysis for the manipulation of rhythm in drum loops},
	Url = {http://www.lam.jussieu.fr/src/Membres/Ravelli/icassp06-2/ravelli_icassp06_b.pdf},
	Volume = {5},
	Year = {2006},
	Abstract = {This paper addresses the issue of drum sound classification in the context of automatic rhythm modification of drum loops. The proposed method segments the signal using an onset detection algorithm, characterises segmented sounds using a spectral feature set, and classifies them using k-means clustering. We propose a simple taxonomy for the grouping of different instrumental sounds under a few utilitarian labels. Results demonstrate the adequacy of our proposed taxonomy while showing that our classification approach outperforms commonly-used supervised learning techniques.},
}

@inproceedings{Ravelli:2005,
	Author = {Ravelli, E. and Sandler, M. and Bello, J. P.},
	Booktitle = {Proceedings of the 8th Conference on Digital Audio Effects (DAFx-05)},
	Title = {Fast implementation for non-linear time-scaling of stereo signals},
	Url = {http://www.elec.qmul.ac.uk/people/juan/Documents/Ravelli-DAFx-2005.pdf},
	Year = {2005},
	Abstract = {In this paper we present an improved implementation of Duxbury's adaptive phase-vocoder approach for audio time-stretching using non-linear time-scaling and temporal masked phase locking at transients [1]. We show that the previous algorithm has some limitations, notably its slow implementation and its incapacity to deal with stereo signals. We propose solutions to this problems including: an improved transient detection, a much faster implementation using the IFFT for re-synthesis and a method for stretching stereo signals without artifacts. Finally, we provide some graphical results and quantitative measures to illustrate our improvements.},
}

@article{Ravelli:2007,
	Author = {Ravelli, E. and Bello, J. P. and Sandler, M.},
	Journal = {IEEE Signal Processing Letters},
	Month = apr,
	Title = {Automatic rhythm modification of drum loops},
	Url = {http://www.lam.jussieu.fr/src/Membres/Ravelli/spl06/ravelli_spl06.pdf},
	Url2 = {http://www.lam.jussieu.fr/src/Membres/Ravelli/spl06/spl06.html},
	Year = {2007},
	Abstract = {We propose a novel system able to modify the rhythm of a given drum loop, known as the original, to match the rhythmic pattern of a second loop, known as the model. Our approach is fully automated, thus eliminating the need for MIDI sequencing. The presented methodology combines standard and state-of-the-art techniques for the segmentation and classification of drum sounds, the matching of drum sequences and the transformation of the original loop. We discuss the advantages and disadvantages of the proposed approach and provides links to examples for the qualitative evaluation of the system's output.},
}

@article{Gillet:2005,
	Author = {Gillet, O. and Richard, G.},
	Journal = {Journal of Intelligent Information Systems},
	Keywords = {subbands},
	Number = {2},
	Pages = {159--177},
	Publisher = {Springer},
	Title = {Drum Loops Retrieval from Spoken Queries},
	Url = {http://www.springerlink.com/content/m1306l28336gh887/fulltext.pdf},
	Volume = {24},
	Year = {2005},
	Abstract = {Recent efforts in audio indexing and music information retrieval mostly focus on melody. If this is appropriate for polyphonic music signals, specific approaches are needed for systems dealing with percussive audio signals such as those produced by drums, tabla or djemb{\'e}. In this article, we present a complete system allowing the management of a drum patterns (or drumloops) database. Queries in this database are formulated with spoken onomatopoeias---short meaningless words imitating the different sounds of the drumkit. The transcription task necessary to index the database is performed using Hidden Markov Models (HMM) and Support Vector Machines (SVM) and achieves a 86.4% correct recognition rate. The syllables of spoken queries are recognized and a relevant statistical model allows the comparison and alignment of the query with the rhythmic sequences stored in the database, in order to provide a set of the most relevant drum loops.},
}

@inproceedings{Van-Steelant:2004,
	Author = {Van Steelant, D. and Tanghe, K. and Degroeve, S. and De Baets, B. and Leman, M. and Martens, J. P.},
	Booktitle = {Proceedings of {GfKl}},
	Title = {Support Vector Machines for Bass and Snare Drum Recognition},
	Url = {http://www.ipem.ugent.be/MAMI/Public/Papers/GfKl2004_SVMForBassAndSnareDrumRecognition.pdf},
	Year = {2004},
	Abstract = {In this paper we attempt to extract information concerning percussive instruments from a musical audio signal. High-dimensional vectors of descriptors are computed from the signal and classified by means of Support Vector Machines (SVM). We investigate the performance on 2 important classes of drum sounds in Western popular music: bass and snare drums, possibly overlapping. The results are encouraging: SVM achieve a high accuracy and F1-measure, with linear kernels performing (nearly) as good as Gaussian kernels, but requiring 1000 times less computation time.},
}

@inproceedings{gillet2003alt,
	Author = {Gillet, O. K. and Richard, G.},
	Booktitle = {Proceedings of the 4th International Conference on Music Information Retrieval (ISMIR 2003)},
	Title = {Automatic Labelling of {Tabla} Signals},
	Url = {http://perso.enst.fr/~grichard/Publications/ismir03.pdf},
	Year = {2003},
}

@inproceedings{Van-Steelant:2004a,
	Author = {Van Steelant, D. and Tanghe, K. and Degroeve, S. and De Baets, B. and Leman, M. and Martens, J. P.},
	Booktitle = {Proceedings of the Annual Machine Learning Conference of Belgium and The Netherlands ({BENELEARN} 2004), Brussels, Belgium},
	Keywords = {percussion transcription, classification, SVM},
	Title = {Classification of percussive sounds using support vector machines},
	Url = {http://www.ipem.ugent.be/MAMI/Public/Papers/ClassificationPercussionSVM_Benelearn2004.pdf},
	Year = {2004},
}

@inproceedings{Herrera:2003,
	Author = {Herrera, P. and Dehamel, A. and Gouyon, F.},
	Booktitle = {114th AES Convention},
	Title = {Automatic labeling of unpitched percussion sounds},
	Url = {http://www.iua.upf.es/mtg/publications/AES114-Herrera2003.PDF},
	Year = {2003},
	Abstract = {We present a large-scale study on the automatic classification of sounds from percussion instruments. Different subsets of temporal and spectral descriptors (up to 208) are used as features that several learning systems exploit to learn class partitions. More than thirty different classes of acoustic and synthetic instruments and near two-thousand different isolated sounds (i.e. not mixed with other ones) have been tested with ten-fold or holdout cross-validation. The best performance can be achieved with Kernel Density estimation (15% of errors), although boosted rule systems yielded similar figures. Multidimensional scaling of the classes provides a graphical and conceptual representation of the relationships between sound classes, and facilitates the explanation of some types of errors. We also explore several options to expand the sound descriptors beyond the class label, as for example the manufacturer-model label and confirm the feasibility of doing that. We finally discuss methodological issues regarding the generalization capabilities of usual experiments that have been done in this area.},
}

@inproceedings{Gillet:2004,
	Author = {Gillet, O. and Richard, G.},
	Booktitle = {Proceedings of the International Conference on Acoustics, Speech, and Signal Processing (ICASSP'04)},
	Title = {Automatic transcription of drum loops},
	Url = {http://ieeexplore.ieee.org/iel5/9248/29346/01326815.pdf?isnumber=&arnumber=1326815},
	Volume = {4},
	Year = {2004},
	Abstract = {Recent efforts in audio indexing and retrieval in music databases mostly focus on melody. If this is appropriate for polyphonic music signals, specific approaches are needed for systems dealing with percussive audio signals such as those produced by drums, tabla or djembe. Most studies of drum signal transcription focus on sounds taken in isolation. In this paper, we propose several methods for drum loop transcription where the drums signals dataset reflects the variability encountered in modern audio recordings (real and natural drum kits, audio effects, simultaneous instruments, etc.). The approaches described are based on hidden Markov models (HMM) and support vector machines (SVM). Promising results are obtained with a 83.9% correct recognition rate for a simplified taxonomy.},
}

%% Conference paper on classifying percussive sounds via spectral centre trajectories (SCTs).
%% NOTE(review): Booktitle only says "Departmental Conference, Yanchep"; the hosting department/institution is not recorded here -- TODO add if known.
@inproceedings{mcdonald:1997,
	Author = {McDonald, S. and Tsang, C. P.},
	Booktitle = {Departmental Conference, Yanchep},
	Title = {Percussive Sound Identification Experiments Using Spectral Centre Trajectories},
	Year = {1997},
	Abstract = {The spectral centre of a sound fragment can be thought of as the `centre of mass' of energy in a sound's spectrum. During the lifetime of a sound, this centre shifts with the changing spectral character of the sound. By tracing the movement of the spectral centre, it is possible to generate a curve or trajectory characterising the sound. This paper presents preliminary experiments investigating the utility of such trajectories for identifying similarity between a variety of percussive sounds. SCTs are shown to be able to classify sounds simply and effectively.},
}

@article{Lakatos:2000,
	Author = {Lakatos, S.},
	Journal = {Perception {\&} Psychophysics},
	Number = {7},
	Pages = {1426--1439},
	Title = {A common perceptual space for harmonic and percussive timbres},
	Volume = {62},
	Year = {2000},
	Abstract = {The goal of a series of listening tests was to better isolate the principal dimensions of timbre, using a wide range of timbres and converging psychophysical techniques. Expert musicians and nonmusicians rated the timbral similarity of three sets of pitched and percussive instruments. Multidimensional scaling analyses indicated that both centroid and rise time comprise the principal acoustic factors across all stimulus sets and that musicians and nonmusicians did not differ significantly in their weighting of these factors. Clustering analyses revealed that participants also categorized percussive and, to a much lesser extent, pitched timbres according to underlying physical-acoustic commonalities. The findings demonstrate that spectral centroid and rise time represent principal perceptual dimensions of timbre, independent of musical training, but that the tendency to group timbres according to source properties increases with acoustic complexity.},
}

%% PhD thesis (Stanford University, 1985) on automatic transcription of percussive music.
%% NOTE(review): the URL path (stanm27) presumably corresponds to a CCRMA report number -- confirm, and record it in a Number/Note field if so.
@phdthesis{schloss1985atp,
	Author = {Schloss, W.A.},
	School = {Stanford University},
	Title = {On the automatic transcription of percussive music: from acoustic signal to high-level analysis},
	Url = {http://ccrma.stanford.edu/STANM/stanms/stanm27/stanm27.pdf},
	Year = {1985},
}

@inproceedings{Kitahara:2006,
	Author = {Kitahara, T. and Goto, M. and Komatani, K. and Ogata, T. and Okuno, H. G.},
	Booktitle = {ICASSP 2006},
	Title = {Instrogram: A New Musical Instrument Recognition Technique Without Using Onset Detection nor {F0} Estimation},
	Year = {2006},
	Abstract = {This paper describes a new technique for recognizing musical instruments in polyphonic music. Because the conventional framework for musical instrument recognition in polyphonic music had to estimate the onset time and fundamental frequency (F0) of each note, instrument recognition strictly suffered from errors of onset detection and F0 estimation. Unlike such a note-based processing framework, our technique calculates the temporal trajectory of instrument existence probabilities for every possible F0, and the results are visualized with a spectrogram-like graphical representation called instrogram. The instrument existence probability is defined as the product of a nonspecific instrument existence probability calculated using PreFEst and a conditional instrument existence probability calculated using the hidden Markov model. Experimental results show that the obtained instrograms reflect the actual instrumentations and facilitate instrument recognition.},
}

%% ICASSP'07 paper evaluating low-level acoustic features for beat classification and tracking.
%% NOTE(review): no Pages or Volume recorded for this entry -- TODO complete from the proceedings.
@inproceedings{Gouyon:2007,
	Author = {Gouyon, F. and Dixon, S. and Widmer, G.},
	Booktitle = {Proceedings of the International Conference on Acoustics, Speech, and Signal Processing (ICASSP'07)},
	Title = {Evaluating low-level features for beat classification and tracking},
	Url = {http://www.inescporto.pt/~fgouyon/docs/GouyonDixonWidmer_ICASSP2007.pdf},
	Year = {2007},
	Abstract = {In this paper, we address the question of which low-level acoustical features are the most adequate for identifying music beats computationally. We consider 172 features computed on consecutive signal frames and systematically evaluate their individual value in the task of providing reliable cues for the presence and localisation of beats in music signals. We compare two ways of evaluating features: their accuracy in a song-specific classification task (classifying beats vs non-beats) and their performance as a front-end to a beat tracking system.},
}

@article{Scheirer:1998,
	Author = {Scheirer, E. D.},
	Journal = {Journal of the Acoustical Society of America},
	Month = jan,
	Number = {1},
	Pages = {588--601},
	Title = {Tempo and beat analysis of acoustic musical signals},
	Url = {http://www.iro.umontreal.ca/~pift6080/documents/papers/scheirer_jasa.pdf},
	Volume = {103},
	Year = {1998},
	Abstract = {A method is presented for using a small number of bandpass filters and banks of parallel comb filters to analyze the tempo of, and extract the beat from, musical signals of arbitrary polyphonic complexity and containing arbitrary timbres. This analysis is performed causally, and can be used predictively to guess when beats will occur in the future. Results in a short validation experiment demonstrate that the performance of the algorithm is similar to the performance of human listeners in a variety of musical situations. Aspects of the algorithm are discussed in relation to previous high-level cognitive models of beat tracking.},
}

%% Conference paper comparing feature-selection methods and classifiers for drum-sound classification.
%% NOTE(review): no Pages or Publisher recorded for this entry -- TODO complete from the proceedings.
@inproceedings{Herrera:2002,
	Author = {Herrera, P. and Yeterian, A. and Gouyon, F.},
	Booktitle = {Second International Conference on Music and Artificial Intelligence},
	Title = {Automatic classification of drum sounds: a comparison of feature selection methods and classification techniques},
	Url = {http://www.iua.upf.es/mtg/publications/ICMAI02-pherrera.pdf},
	Year = {2002},
	Abstract = {We present a comparative evaluation of automatic classification of a sound database containing more than six hundred drum sounds (kick, snare, hihat, toms and cymbals). A preliminary set of fifty descriptors has been refined with the help of different techniques and some final reduced sets including around twenty features have been selected as the most relevant. We have then tested different classification techniques (instance-based, statistical-based, and tree-based) using ten-fold cross-validation. Three levels of taxonomic classification have been tested: membranes versus plates (super-category level), kick vs. snare vs. hihat vs. toms vs. cymbals (basic level), and some basic classes (kick and snare) plus some sub-classes -i.e. ride, crash, open-hihat, closed hihat, high-tom, medium-tom, low-tom- (sub-category level). Very high hit-rates have been achieved (99%, 97%, and 90% respectively) with several of the tested techniques.
},
}

@inproceedings{Tindale:2004,
	Author = {Tindale, A. and Kapur, A. and Tzanetakis, G. and Fujinaga, I.},
	Booktitle = {ISMIR 2004},
	Keywords = {snare drum},
	Title = {Retrieval of Percussion Gestures Using Timbre Classification Techniques},
	Url = {http://www.ee.columbia.edu/~dpwe/ismir2004/CRFILES/paper235.pdf},
	Year = {2004},
	Abstract = {Musicians are able to recognise the subtle differences in timbre produced by different playing techniques on an instrument, yet there has been little research into achieving this with a computer. This paper will demonstrate an automatic system that can successfully recognise different timbres produced by different performance techniques and classify them using signal processing and classification tools. Success rates over 90% are achieved when classifying snare drum timbres produced by different playing techniques.},
}

@article{Markel:1972,
	Author = {Markel, J.},
	Issn = {0018-9278},
	Ja = {Audio and Electroacoustics, IEEE Transactions on},
	Journal = {IEEE Transactions on Audio and Electroacoustics},
	Keywords = {linear predictive coding, LPC analysis, formant trajectories},
	Number = {2},
	Pages = {129--137},
	Title = {Digital inverse filtering---a new tool for formant trajectory estimation},
	Ty = {JOUR},
	Url = {http://ieeexplore.ieee.org/iel6/8337/26091/01162367.pdf?isnumber=&arnumber=1162367},
	Volume = {20},
	Year = {1972},
	Abstract = {A new algorithm, based upon a digital inverse filter formulation, is presented and shown to be quite useful for estimating resonance or formant structure of voiced speech. The output of the algorithm is a set of raw data corresponding to peak frequencies versus time which is then used to estimate the first three and sometimes four continuously varying formant trajectories. Although an algorithm for automatically extracting the formants from the raw data is not presented here, for nearly 90 percent of the time an automatic decision algorithm is trivial, namely, the first three peaks of the reciprocal of the inverse filter spectrum define the first three formants.},
}

@inbook{Rasmussen:2006ch2,
	Author = {Rasmussen, Carl Edward and Williams, Christopher K. I.},
	Chapter = {2},
	Isbn = {0-262-18253-X},
	Keywords = {Gaussian processes, Machine learning},
	Publisher = {MIT Press},
	Title = {{Gaussian} Processes for Machine Learning},
	Url = {http://mitpress.mit.edu/books/chapters/026218253Xchap2.pdf},
	Year = {2006},
}

@inbook{Rasmussen:2006ch1,
	Author = {Rasmussen, Carl Edward and Williams, Christopher K. I.},
	Chapter = {1},
	Isbn = {0-262-18253-X},
	Keywords = {Gaussian processes, Machine learning},
	Publisher = {MIT Press},
	Title = {{Gaussian} Processes for Machine Learning},
	Url = {http://mitpress.mit.edu/books/chapters/026218253Xchap1.pdf},
	Year = {2006},
}

@incollection{Williams.:2002,
	Author = {Williams, C. K. I.},
	Booktitle = {Handbook of Brain Theory and Neural Networks},
	Edition = {2nd},
	Editor = {Arbib, M. A.},
	Keywords = {Gaussian processes, Machine learning},
	Pages = {466--470},
	Publisher = {The MIT Press},
	Title = {{Gaussian} Processes},
	Url = {http://www.dai.ed.ac.uk/homes/ckiw/postscript/hbtnn.ps.gz},
	Year = {2002},
}

@incollection{Mackay:1998,
	Address = {Berlin},
	Author = {MacKay, D. J. C.},
	Booktitle = {Neural Networks and Machine Learning},
	Editor = {Bishop, C. M.},
	Keywords = {Machine learning, Gaussian processes},
	Pages = {133--165},
	Publisher = {Springer},
	Series = {NATO ASI Series},
	Title = {Introduction to {Gaussian} Processes},
	Url = {http://www.inference.phy.cam.ac.uk/mackay/gpB.pdf},
	Urldate = {2006},
	Volume = {168},
	Year = {1998},
}

@inproceedings{langley1992abc,
	Address = {Menlo Park, USA},
	Author = {Langley, P. and Iba, W. and Thompson, K.},
	Booktitle = {Proceedings of the Tenth National Conference on Artificial Intelligence},
	Pages = {223--228},
	Publisher = {AAAI Press},
	Title = {An analysis of {Bayesian} classifiers},
	Url = {http://homepage.westmont.edu/iba/pubs/wbayes.ps.gz},
	Year = {1992},
}

@inproceedings{kohavi96,
	Author = {Kohavi, Ron},
	Booktitle = {Proceedings of the Second International Conference on Knowledge Discovery and Data Mining},
	Keywords = {decision trees, WEKA},
	Pages = {202--207},
	Title = {Scaling Up the Accuracy of {Naive-Bayes} Classifiers: a Decision-Tree Hybrid},
	Year = {1996},
}

@book{Witten:2005,
	Author = {Witten, I. H. and Frank, E.},
	Edition = {2nd},
	Keywords = {WEKA},
	Publisher = {Morgan Kaufmann},
	Title = {Data Mining: Practical Machine Learning Tools and Techniques},
	Year = {2005},
}

@inproceedings{Harte:2005,
	Author = {Harte, C. and Sandler, M. B.},
	Booktitle = {Audio Engineering Society Convention},
	Month = may,
	Organization = {Audio Engineering Society},
	Title = {Automatic Chord Identification Using a Quantised Chromagram},
	Year = {2005},
	Abstract = {This paper presents an approach to the problem of identifying musical chords from audio recordings. In our approach, a tuning algorithm is applied to a 36-bin chromagram to accurately locate the boundaries between semitones. This allows the calculation of a 12-bin semitone-quantised chromagram, which can then be compared with a set of predefined chord templates in order to generate a sequence of chord estimates. The performance of our method is evaluated by comparing the results with a test database of hand-labelled pieces taken from the Beatles back catalogue, from which the initial results are encouraging. The paper concludes with a discussion of some possible improvements to the algorithms presented.},
}

@article{Nwe:2007,
	Author = {Nwe, T. L. and Li, H.},
	Doi = {10.1109/TASL.2006.876756},
	Journal = {IEEE Transactions on Audio, Speech, and Language Processing},
	Keywords = {vibrato, singing voice, singer ID, Hidden Markov Models},
	Title = {Exploring Vibrato-Motivated Acoustic Features for Singer Identification},
	Url = {http://ieeexplore.ieee.org/iel5/10376/32978/101109TASL2006876756.pdf?isnumber=32978&arnumber=101109TASL2006876756},
	Year = {2007},
	Abstract = {Vibrato is a slightly tremulous effect imparted to vocal or instrumental tone for added warmth and expressiveness through slight variation in pitch. It corresponds to a periodic fluctuation of the fundamental frequency. It is common for a singer to develop a vibrato function to personalize his/her singing style. In this paper, we explore the acoustic features that reflect vibrato information in order to identify singers of popular music. We start with an enhanced vocal detection method that allows us to select vocal segments with high confidence. From the selected vocal segments, the cepstral coefficients which reflect the vibrato characteristics are computed. These coefficients are derived using bandpass filters, such as parabolic and cascaded bandpass filters, spread according to the octave frequency scale. The strategy of our classifier formulation is to utilize the high level musical knowledge of song structure in singer modeling. Singer identification is validated on a database containing 84 popular songs from commercially available CD recordings from 12 singers. We achieve an average error rate of 16.2% in segment level identification.},
}

@article{Christensen:1976,
	Author = {Christensen, R. and Strong, W. and Palmer, E.},
	Issn = {0096-3518},
	Journal = {IEEE Transactions on Acoustics, Speech, and Signal Processing},
	Keywords = {LPC analysis, linear predictive coding, formant tracking, formant},
	Number = {1},
	Pages = {8--14},
	Title = {A comparison of three methods of extracting resonance information from predictor-coefficient coded speech},
	Ty = {JOUR},
	Url = {http://ieeexplore.ieee.org/iel6/29/26116/01162767.pdf?isnumber=&arnumber=1162767},
	Volume = {24},
	Year = {1976},
	Abstract = {Three methods of extracting resonance information from predictor-coefficient coded speech are compared. The methods are finding roots of the polynomial in the denominator of the transfer function using Newton iteration, picking peaks in the spectrum of the transfer function, and picking peaks in the negative of the second derivative of the spectrum. A relationship was found between the bandwidth of a resonance and the magnitude of the second derivative peak. Data, accumulated from a total of about two minutes of running speech from both female and male talkers, are presented illustrating the relative effectiveness of each method in locating resonances. The second-derivative method was shown to locate about 98 percent of the significant resonances while the simple peak-picking method located about 85 percent.},
}

@article{Thomas:1997,
	Author = {Thomas, B. and Arani, F. and Honary, B.},
	Issn = {0013-5194},
	Ja = {Electronics Letters},
	Journal = {Electronics Letters},
	Keywords = {convergence of numerical methods, iterative methods, linear predictive coding, polynomials, speech processing, LPC analysis, accurate root estimation, algorithm, fixed time interval, iterations, speech model filter polynomial, speech model root location},
	Number = {5},
	Pages = {355--356},
	Title = {Algorithm for speech model root location},
	Ty = {JOUR},
	Url = {http://ieeexplore.ieee.org/iel5/2220/12611/00581016.pdf?isnumber=&arnumber=581016},
	Volume = {33},
	Year = {1997},
	Abstract = {A novel algorithm for locating the roots of the speech model filter polynomial derived from LPC analysis is proposed. The technique is guaranteed to converge and can be implemented in a fixed time interval. When several iterations are performed, very accurate root estimation can be achieved},
}

@article{Jinhai:1993,
	Author = {Jinhai, C. and Gangji, J. and Lihe, Z.},
	Journal = {Electronics Letters},
	Month = nov,
	Number = {24},
	Pages = {2081--2082},
	Title = {New method for extracting speech formants using {LPC} phase spectrum},
	Url = {http://ieeexplore.ieee.org/iel1/2220/6337/00247580.pdf?isnumber=6337&prod=STD&arnumber=247580&arnumber=247580&arSt=2081&ared=2082&arAuthor=Cai+Jinhai%3B+Jiang+Gangji%3B+Zhang+Lihe},
	Volume = {29},
	Year = {1993},
	Abstract = {A new method for formant extraction using the LPC phase spectrum is proposed, which is especially effective in finding merged peaks. The bandwidth of a formant is easily calculated from the magnitude of the third derivative of the LPC phase spectrum},
}

@article{Kim:2006,
	Author = {Kim, C. and Seo, K. D. and Sung, W.},
	Doi = {10.1155/ASP/2006/67960},
	Journal = {EURASIP Journal on Applied Signal Processing},
	Keywords = {not-at-qm},
	Pages = {Article ID 67960, 16 pages},
	Title = {A Robust Formant Extraction Algorithm Combining Spectral Peak Picking and Root Polishing},
	Url = {http://www.hindawi.com/GetArticle.aspx?doi=10.1155/ASP/2006/67960&e=cta},
	Volume = {2006},
	Year = {2006},
}

@article{Souza:1977,
	Author = {de Souza, P.},
	Issn = {0096-3518},
	Journal = {IEEE Transactions on Acoustics, Speech, and Signal Processing},
	Number = {6},
	Pages = {554--559},
	Title = {Statistical tests and distance measures for {LPC} coefficients},
	Ty = {JOUR},
	Url = {http://ieeexplore.ieee.org/iel6/29/26128/01163004.pdf?isnumber=&arnumber=1163004},
	Volume = {25},
	Year = {1977},
	Abstract = {This paper considers the problem of comparing two sets of (LPC) coefficients or, more generally, that of comparing two short segments of speech via LPC techniques. It is shown that Itakura's prediction-residual ratio is intuitively unsatisfactory and theoretically misleading as a distance measure. Two slower, but more accurate statistical means of comparison are suggested, and these are supported by evidence from a simulation study.},
}

@inproceedings{Bozkurt:2004a,
	Author = {Bozkurt, B. and Doval, B. and D'Alessandro, C. and Dutoit, T.},
	Title = {Improved Differential Phase Spectrum Processing For Formant Tracking},
	Booktitle = {ICSLP 2004},
	Year = {2004},
	Url = {http://tcts.fpms.ac.be/publications/papers/2004/icslp2004_bbdcdtd3.pdf},
	Keywords = {phase, formant tracking},
	Abstract = {This study presents an improved version of our previously introduced formant tracking algorithm. The algorithm is based on processing the negative derivative of the argument of the chirp-z transform (termed as the differential phase spectrum) of a given speech signal. No modeling is included in the procedure but only peak picking on differential phase spectrum. We discuss the effects of roots of z-transform to differential phase spectrum and the need to ensure that all zeros are at some distance from the circle where chirp-z transform is computed. For that, we include an additional zero-decomposition step in our previously presented algorithm to improve its robustness. The final version of the algorithm is tested for analysis of synthetic speech and real speech signals and compared to two other formant tracking systems. },
}

@inproceedings{Bazzi:2003,
	Author = {Bazzi, I. O. and Acero, A. and Deng, L.},
	Booktitle = {Proceedings of the International Conference on Acoustics, Speech, and Signal Processing (ICASSP '03)},
	Issn = {1520-6149},
	Keywords = {least mean squares methods, maximum likelihood estimation, nonlinear estimation, optimisation, search problems, speech processing, speech recognition, table lookup, EM training, MAP method, MMSE, Switchboard corpus, Viterbi search, acoustic feature space, acoustic observations, expectation maximization approach, first order continuity constraints, formant frequencies, formant tracking, formant trajectories, mapping, parameter-free nonlinear predictor, predictor codebook, quantization, residual capture, speech signal decomposition},
	Pages = {I-464--I-467},
	Title = {An expectation maximization approach for formant tracking using a parameter-free non-linear predictor},
	Ty = {CONF},
	Url = {http://ieeexplore.ieee.org/iel5/8535/26983/01198818.pdf?isnumber=&arnumber=1198818},
	Volume = {1},
	Year = {2003},
	Abstract = {This paper presents a new approach for formant tracking using a parameter-free non-linear predictor that maps formant frequencies and bandwidths into the acoustic feature space. The approach relies on decomposing the speech signal into two components: the first component captures the mapping between formants and acoustic observations, while the second component is intended to capture the residual in the signal. We build the mapping by quantizing the formant space and creating a predictor codebook. Formant tracking is achieved by: (1) EM training of the parameters of the residual component, and (2) searching the predictor codebook for the best formant values. We explore both MAP and MMSE methods for performing formant tracking with the proposed approach. Furthermore, we impose first order continuity constraints on formant trajectories, and use Viterbi search to perform formant tracking. We present formant tracking results on data from the Switchboard corpus.},
}

@article{McCandless:1974,
	Author = {McCandless, S.},
	Journal = {IEEE Transactions on Acoustics, Speech, and Signal Processing},
	Keywords = {formant tracking},
	Number = {2},
	Pages = {135--141},
	Title = {An algorithm for automatic formant extraction using linear prediction spectra},
	Url = {http://ieeexplore.ieee.org/iel6/29/26103/01162559.pdf?isnumber=&arnumber=1162559},
	Volume = {22},
	Year = {1974},
	Abstract = {An algorithm is presented which finds the frequency and amplitude of the first three formants during all vowel-like segments of continuous speech. It uses as input the peaks of the linear prediction spectra and a segmentation parameter to indicate energy and voicing. Ideally, the first three peaks are the first three formants. Frequently, however, two peaks merge, or spurious peaks appear, and the difficult part is to recognize such situations and deal with them. The general method is to fill formant slots with the available peaks at each frame, based on frequency position relative to an educated guess. Then, if a peak is left over and/or a slot is unfilled, special routines are called to decide how to deal with them. Included is a formant enhancement technique, analogous to a similar technique which has been implemented via the chirp-z transform [8], which usually succeeds in separating two merged formants. Processing begins at the middle of each high volume voiced segment, where formants are most likely to be correct, and branches outward from there in both directions in time, using the most recently found formant frequencies as the educated guess for the current frame. The algorithm has been implemented at Lincoln Laboratory on the Univac 1219 and the Fast Digital Processor, a programmable processor [9], and has been tested on a large number of unrestricted sentences.},
}

@inproceedings{Wrench:1995,
	Author = {Wrench, Alan A.},
	Booktitle = {Proceedings of the XIIIth International Congress of Phonetic Sciences},
	Pages = {460--463},
	Title = {Analysis of fricatives using multiple centres of gravity},
	Volume = {4},
	Year = {1995},
}

@article{makhoul1975lpt,
	Author = {Makhoul, J.},
	Title = {Linear prediction: A tutorial review},
	Journal = {Proceedings of the IEEE},
	Volume = {63},
	Number = {4},
	Pages = {561--580},
	Year = {1975},
	Url = {http://ieeexplore.ieee.org/iel5/5/31191/01451722.pdf?isnumber=&arnumber=1451722},
	Abstract = {This paper gives an exposition of linear prediction in the analysis of discrete signals. The signal is modeled as a linear combination of its past values and present and past values of a hypothetical input to a system whose output is the given signal. In the frequency domain, this is equivalent to modeling the signal spectrum by a pole-zero spectrum. The major part of the paper is devoted to all-pole models. The model parameters are obtained by a least squares analysis in the time domain. Two methods result, depending on whether the signal is assumed to be stationary or nonstationary. The same results are then derived in the frequency domain. The resulting spectral matching formulation allows for the modeling of selected portions of a spectrum, for arbitrary spectral shaping in the frequency domain, and for the modeling of continuous as well as discrete spectra. This also leads to a discussion of the advantages and disadvantages of the least squares error criterion. A spectral interpretation is given to the normalized minimum prediction error. Applications of the normalized error are given, including the determination of an 'optimal' number of poles. The use of linear prediction in data compression is reviewed. For purposes of transmission, particular attention is given to the quantization and encoding of the reflection (or partial correlation) coefficients. Finally, a brief introduction to pole-zero modeling is given.},
}

@article{Weruaga:2006,
	Author = {Weruaga, L.},
	Journal = {IEEE Transactions on Signal Processing},
	Title = {Spectral All-Pole Estimation},
	Url = {http://www.viskom.oeaw.ac.at/~luis/weruaga/publications/double-column.pdf},
	Year = {2006},
	Abstract = {Autoregressive (AR) modeling is a widely popular spectral analysis technique commonly resolved in a time-domain framework. This paper presents a novel analysis and algorithmic framework for estimating poles directly from spectral samples. The explicit goal is the minimization of the squared spectral residual between the spectral samples and the all-pole model, this last one described by the poles (instead of AR coefficients). This objective results in a non-linear convex functional containing a global minimum that is equivalent to that delivered by the time-domain autocorrelation method. The solution is obtained with a Newton--Raphson mechanism of fast convergence. The paper addresses also other types of functionals and its equivalence to the Maximum Likelihood estimation. The statistical performance and convergence properties of the method are demonstrated with simulations of practical scenarios, such as all-pole fitting on stochastic and deterministic harmonic signals.},
}

@inproceedings{paliwal2003ups,
	Author = {Paliwal, K. K. and Alsteris, L.},
	Booktitle = {Eurospeech 2003},
	Title = {Usefulness of phase spectrum in human speech perception},
	Url = {http://maxwell.me.gu.edu.au/spl/publications/papers/euro03_leigh.pdf},
	Year = {2003},
	Abstract = {Short-time Fourier transform of speech signal has two components: magnitude spectrum and phase spectrum. In this paper, relative importance of short-time magnitude and phase spectra on speech perception is investigated. Human perception experiments are conducted to measure intelligibility of speech tokens synthesized either from magnitude spectrum or phase spectrum. It is traditionally believed that magnitude spectrum plays a dominant role for shorter windows (20-30 ms); while phase spectrum is more important for longer windows (128-3500 ms). It is shown in this paper that even for shorter windows, phase spectrum can contribute to speech intelligibility as much as the magnitude spectrum if the shape of the window function is properly selected.},
}

@article{yegnanarayana1992sgd,
	Author = {Yegnanarayana, B. and Murthy, H. A.},
	Doi = {10.1109/78.157227},
	Journal = {IEEE Transactions on Signal Processing},
	Number = {9},
	Pages = {2281--2289},
	Title = {Significance of group delay functions in spectrum estimation},
	Url = {http://ieeexplore.ieee.org/iel4/78/4075/00157227.pdf?tp=&arnumber=157227&isnumber=4075},
	Volume = {40},
	Year = {1992},
	Abstract = {A method of spectrum estimation using group delay functions is proposed. This method exploits the additive property of the Fourier transform (FT) phase to extract spectral information of the signal in the presence of noise. The phase is generally featureless due to random polarity and wrappings, but the group delay function can be processed to derive significant information such as peaks in the spectral envelope. In the resulting spectral estimates obtained the resolution properties of the periodogram estimate are preserved while the variance is reduced. Variance caused by the sidelobe leakage due to windows and additive noise are significantly reduced even in the spectral estimate obtained using a single realization of the observation peak. Resolution is primarily dictated by the size of the data window. The method works even for high noise levels. The results of this procedure are demonstrated through two illustrative examples: estimation of sinusoids in noise and estimation of the narrowband autoregressive process in noise},
}

@article{Kepesi:2006,
	Author = {K{\'e}pesi, M. and Weruaga, L.},
	Doi = {10.1016/j.specom.2005.08.004},
	Journal = {Speech Communication},
	Keywords = {chirp, speech analysis},
	Month = may,
	Number = {5},
	Pages = {474--492},
	Title = {Adaptive chirp-based time--frequency analysis of speech signals},
	Url = {http://www.viskom.oeaw.ac.at/~luis/weruaga/publications/specom1497.pdf},
	Volume = {48},
	Year = {2006},
	Abstract = {In this paper a new method for time-frequency analysis of speech signals is proposed. Given that the fundamental frequency of voiced speech often undergoes rapid fluctuation and that in these cases the classical spectrogram suffers from blurring and artifacts, an adaptive analysis basis composed of quadratic chirps is what we consider. The analysis basis of the proposed Short-Time Fan-Chirp Transform (FChT) is defined univocally by the analysis window length and by the frequency variation rate, this parameter being predicted from the last computed spectral segments. The prediction algorithm is based on time-tracking the joint trajectory of the harmonic contours, this process providing also a voiced/unvoiced detection parameter. Comparative results between the proposed Short-time FChT and popular time-frequency techniques reveal an improvement in spectral and time-frequency representation. Since the signal can be synthesized from its FChT, the proposed method is suitable for filtering purposes.},
}

@inproceedings{Weruaga:2004,
	Author = {Weruaga, L. and K{\'e}pesi, M.},
	Booktitle = {Eusipco 2004 (12th European Signal Processing Conference)},
	Title = {Speech Analysis with the Fast Chirp Transform},
	Url = {http://www.eurasip.org/content/Eusipco/2004/defevent/papers/cr1374.pdf},
	Year = {2004},
	Abstract = {The Chirp transform is a powerful analysis tool for variable frequency signals such as speech. The computational load represents the main limitation of its original formulation, discouraging its use in real-time applications. This paper analyzes a fast implementation, based on performing time-warping on the signal under analysis, combined with the Fast Fourier Transform. The performance of the Fast Chirp transform depends on the one hand on the estimation of the time-warping operation based on the signal characteristics, and, on the other hand on the interpolation technique used for the warping. Observations from the analysis of speech signals support the method and the further lines.},
}

@inproceedings{Duda:2004,
	Author = {Duda, K.},
	Booktitle = {Eusipco 2004 (12th European Signal Processing Conference)},
	Keywords = {FFT, fixed-point},
	Title = {Integer Fast {Fourier} Transform -- Implementation and Application},
	Url = {http://www.eurasip.org/content/Eusipco/2004/defevent/papers/cr1295.pdf},
	Year = {2004},
	Abstract = {Transforms that maps integers to integers have advantages not available for floating point transforms. Integer computations are faster and free from round off errors (integer transforms are widely used for lossless coding). The paper presents an algorithm of Integer Fast Fourier Transform (intFFT) based on lifting factorization. Integer Discrete Cosine Transform (intDCT) is also computed as an example of intFFT application.},
}

@inproceedings{Bozkurt:2004,
	Author = {Bozkurt, B. and Doval, B. and D'Alessandro, C. and Dutoit, T.},
	Booktitle = {Eusipco 2004 (12th European Signal Processing Conference)},
	Title = {Appropriate Windowing for Group Delay Analysis and Roots of {Z-Transform} of Speech Signals},
	Url = {http://www.eurasip.org/content/Eusipco/2004/defevent/papers/cr1125.pdf},
	Year = {2004},
	Abstract = {This study discusses the difficulties of phase spectrum analysis of speech signals and shows that appropriate windowing is very crucial for obtaining reliable phase spectra. The main difficulties of phase based analysis stem from the domination of spiky effects of roots (zeros) of the signal z-transform close to the unit circle. We show how this problem is linked to windowing by discussing zero-patterns for speech signals. Once windowing is performed properly, group delay functions are much less noisy and reveal clearly formant information.},
}

@inproceedings{Zapata:2004,
	Author = {Zapata, J. G. and Mart{\'\i}n, J. C. D. and Vilda, P. G.},
	Booktitle = {Eusipco 2004 (12th European Signal Processing Conference)},
	Keywords = {formant, linear predictive coding, speech analysis, formant tracking},
	Title = {Fast formant estimation by complex analysis of {LPC} coefficients},
	Url = {http://www.eurasip.org/content/Eusipco/2004/defevent/papers/cr1750.pdf},
	Year = {2004},
	Abstract = {This paper describes a selective root finding method based in residue theory. It can find the poles of the LPC model of the speech signal close to the unit circle, without spend computations with the lesser significant internal poles. This feature makes it faster than generic root-finding methods, if the founded poles complain certain specifications. With a high order LPC model, the selected poles are in better correspondence with the formants than the local maxima of the spectral envelope. Experimental results are showed.},
}

@techreport{Frank:1999,
	Author = {Frank, E. and Witten, I. H.},
	Institution = {Department of Computer Science, The University of Waikato, Hamilton},
	Keywords = {decision trees},
	Number = {99/10},
	Title = {Reduced-error pruning with significance tests},
	Year = {1999},
	Abstract = {When building classification models, it is common practice to prune them to counter spurious effects of the training data: this often improves performance and reduces model size. 'Reduced-error pruning' is a fast pruning procedure for decision trees that is known to produce small and accurate trees. Apart from the data from which the tree is grown, it uses an independent 'pruning' set, and pruning decisions are based on the model's error rate on this fresh data. Recently it has been observed...},
}

@electronic{wekaexptut353,
	Author = {Scuse, D. and Reutemann, P.},
	Keywords = {WEKA},
	Title = {{WEKA} Experimenter Tutorial for Version 3-5-3},
	Url = {http://kent.dl.sourceforge.net/sourceforge/weka/ExperimenterTutorial-3.5.3.pdf},
	Urldate = {1st November 2006},
	Year = {2006},
}

@electronic{wekauserguide353,
	Author = {Kirkby, R. and Frank, E.},
	Keywords = {WEKA},
	Title = {{WEKA} Explorer User Guide for Version 3-5-3},
	Url = {http://kent.dl.sourceforge.net/sourceforge/weka/ExplorerGuide-3.5.3.pdf},
	Urldate = {1st November 2006},
	Year = {2006},
}

@article{Davis:1980,
	Author = {Davis, S. B. and Mermelstein, P.},
	Journal = {IEEE Transactions on Acoustics, Speech, and Signal Processing},
	Keywords = {MFCC},
	Number = {4},
	Pages = {357--366},
	Title = {Comparison of Parametric Representations for Monosyllabic Word Recognition in Continuously Spoken Sentences},
	Url = {http://ieeexplore.ieee.org/iel6/29/26147/01163420.pdf?isnumber=26147&prod=STD&arnumber=1163420&arnumber=1163420&arSt=+357&ared=+366&arAuthor=Davis%2C+S.%3B+Mermelstein%2C+P.},
	Volume = {28},
	Year = {1980},
	Abstract = {Several parametric representations of the acoustic signal were compared with regard to word recognition performance in a syllable-oriented continuous speech recognition system. The vocabulary included many phonetically similar monosyllabic words, therefore the emphasis was on the ability to retain phonetically significant acoustic information in the face of syntactic and duration variations. For each parameter set (based on a mel-frequency cepstrum, a linear frequency cepstrum, a linear prediction cepstrum, a linear prediction spectrum, or a set of reflection coefficients), word templates were generated using an efficient dynamic warping method, and test data were time registered with the templates. A set of ten mel-frequency cepstrum coefficients computed every 6.4 ms resulted in the best performance, namely 96.5 percent and 95.0 percent recognition with each of two speakers. The superior performance of the mel-frequency cepstrum coefficients may be attributed to the fact that they better represent the perceptually relevant aspects of the short-term speech spectrum.},
}

@inproceedings{Tzanetakis:2001,
	Author = {Tzanetakis, G. and Essl, G. and Cook, P.},
	Title = {Automatic Musical Genre Classification of Audio Signals},
	Booktitle = {Proc. Int. Symposium on Music Information Retrieval (ISMIR)},
	Year = {2001},
	Url = {http://www.cs.uvic.ca/~gtzan/work/pubs/ismir01gtzan.pdf},
	Abstract = {Musical genres are categorical descriptions that are used to describe music. They are commonly used to structure the increasing amounts of music available in digital form on the Web and are important for music information retrieval. Genre categorization for audio has traditionally been performed manually. A particular musical genre is characterized by statistical properties related to the instrumentation, rhythmic structure and form of its members. In this work, algorithms for the automatic genre categorization of audio signals are described. More specifically, we propose a set of features for representing texture and instrumentation. In addition a novel set of features for representing rhythmic structure and strength is proposed. The performance of those feature sets has been evaluated by training statistical pattern recognition classifiers using real world audio collections. Based on the automatic hierarchical genre classification two graphical user interfaces for browsing and interacting with large audio collections have been developed.},
}

@inproceedings{Paulus:2002,
	Author = {Paulus, J. and Klapuri, A.},
	Booktitle = {ISMIR 2002},
	Title = {Measuring the Similarity of Rhythmic Patterns},
	Url = {http://www.cs.tut.fi/sgn/arg/music/ismir2002_paulus.pdf},
	Year = {2002},
	Abstract = {A system is described which measures the similarity of two arbitrary rhythmic patterns. The patterns are represented as acoustic signals, and are not assumed to have been performed with similar sound sets. Two novel methods are presented that constitute the algorithmic core of the system. First, a probabilistic musical meter estimation process is described, which segments a continuous musical signal into patterns. As a side-product, the method outputs tatum, tactus (beat), and measure lengths. A subsequent process performs the actual similarity measurements. Acoustic features are extracted which model the fluctuation of loudness and brightness within the pattern, and dynamic time warping is then applied to align the patterns to be compared. In simulations, the system behaved consistently by assigning high similarity measures to similar musical rhythms, even when performed using different sound sets.},
}

@inproceedings{Abdallah:2003a,
	Author = {Abdallah, S. A. and Plumbley, M. D.},
	Booktitle = {4th International Symposium on Independent Component Analysis and Blind Signal Separation (ICA2003)},
	Keywords = {ICA, onset detection},
	Month = apr,
	Pages = {233--238},
	Title = {Probability as metadata: Event detection in music using {ICA} as a conditional density model},
	Year = {2003},
	Abstract = {We consider the problem of detecting note onsets in music under the hypothesis that the onsets, and events in general, are essentially surprising moments, and that event detection should therefore be based on an explicit probability model of the sensory input, which generates a moment-by-moment trace of the probability of each observation as it is made. Relatively unexpected events should thus appear as clear spikes. In this way, several well known methods of onset detection can be understood in terms of an implicit probability model. We apply ICA to the problem as an adaptive non-Gaussian model, and investigate the use of ICA as a conditional probability model. The results obtained using several methods on two extracts of piano music are presented and compared. Finally, we tentatively suggest an information theoretic interpretation of the approach.},
}

@techreport{Peeters:2004,
	Author = {Peeters, G.},
	Title = {A Large Set of Audio Features for Sound Description},
	Institution = {IRCAM},
	Year = {2004},
	Url = {http://recherche.ircam.fr/equipes/analyse-synthese/peeters/ARTICLES/Peeters_2003_cuidadoaudiofeatures.pdf},
}

@inproceedings{Paulus:2006,
	Author = {Paulus, J.},
	Booktitle = {ICASSP 2006},
	Keywords = {Hidden Markov Models, percussion transcription},
	Title = {Acoustic modelling of drum sounds with Hidden {Markov} Models for music transcription},
	Year = {2006},
	Abstract = {This paper describes two methods for applying hidden Markov models (HMMs) to acoustic modelling of drum sound events for polyphonic music transcription. The proposed methods are instrumentwise binary modelling and modelling of instrument combinations. In the first, each target instrument is modelled with a 'sound' model and all target instruments share a 'silence' model. Each instrument is transcribed independently from the others. In the latter method, different instrument combinations are modelled, and an additional 'silence' model is created. The proposed methods are evaluated with simulations with acoustic data, and compared with two reference methods. Simulations show that combination modelling performs better than instrument-wise modelling.},
}

@inproceedings{Paulus:2003,
	Author = {Paulus, J. and Klapuri, A.},
	Booktitle = {Proc. of the 6th Int. Conference on Digital Audio Effects (DAFX-03)},
	Keywords = {onset detection, beatboxing, percussion transcription},
	Title = {Model-based event labelling in the transcription of percussive audio signals},
	Url = {http://www.cs.tut.fi/sgn/arg/music/dafx03_paulus.pdf},
	Year = {2003},
	Abstract = {In this paper we describe a method for the transcription of percussive audio signals which have been performed with arbitrary nondrum sounds. The system locates sound events from the input signal using an onset detector. Then a set of features is extracted from the onset times. Feature vectors are clustered and the clusters are assigned with labels which describe the rhythmic role of each event. For the labeling, a novel method is proposed which is based on metrical (temporal) positions of the sound events within the measures. The system is evaluated using monophonic percussive tracks consisting of non-drum sounds. In simulations, the system achieved a total error rate of 33.7{\%}. Demo signals are available at URL:<http://www.cs.tut.fi/~paulus/demo/>.},
}

@electronic{Jones:2005,
	Author = {Jones, D.},
	Keywords = {QMF},
	Title = {Quadrature Mirror Filterbanks ({QMF})},
	Url = {http://cnx.org/content/m12770/1.2/},
	Year = {2005},
}

@inproceedings{Duxbury:2004,
	Author = {Duxbury, C. and Bello, J. P. and Sandler, M. and Davies, M.},
	Booktitle = {Proceedings of the 7th Conference on Digital Audio Effects (DAFx-04)},
	Keywords = {subband onset detection, QMF, subbands, onset detector timing},
	Title = {A comparison between fixed and multiresolution analysis for onset detection in musical signals},
	Url = {http://www.elec.qmul.ac.uk/people/miked/documents/Duxburyetal_dafx04.pdf},
	Year = {2004},
	Abstract = {A study is presented for the use of multiresolution analysis-based onset detection in the complex domain. It shows that using variable time-resolution across frequency bands generates sharper detection functions for higher bands and more accurate detection functions for lower bands. The resulting method improves the localisation of onsets on fixed-resolution schemes, by favouring the increased time precision of higher subbands during the combination of results.},
}

@inproceedings{Han:2006,
	Author = {Han, Wei and Chan, Cheong-Fat and Choy, Chiu-Sing and Pun, Kong-Pang},
	Booktitle = {Proceedings of the 2006 IEEE International Symposium on Circuits and Systems (ISCAS 2006)},
	Ja = {Circuits and Systems, 2006. ISCAS 2006. Proceedings. 2006 IEEE International Symposium on},
	Keywords = {cepstral analysis, logic gates, speech recognition, MFCC extraction algorithm, MFCC extraction method, Mel-frequency cepstrum coefficients},
	Pages = {4 pp},
	Title = {An efficient {MFCC} extraction method in speech recognition},
	Ty = {CONF},
	Url = {http://ieeexplore.ieee.org/iel5/11145/35661/01692543.pdf?isnumber=35661&arnumber=1692543},
	Year = {2006},
	Abstract = {This paper introduces a new algorithm of extracting MFCC for speech recognition. The new algorithm reduces the computation power by 53{\%} compared to the conventional algorithm. Simulation results indicate the new algorithm has a recognition accuracy of 92.93{\%}. There is only a 1.5{\%} reduction in recognition accuracy compared to the conventional MFCC extraction algorithm, which has an accuracy of 94.43{\%}. However, the number of logic gates required to implement the new algorithm is about half of the MFCC algorithm, which makes the new algorithm very efficient for hardware implementation.},
}

@article{569587,
	Address = {Beijing, China},
	Author = {Zheng, Fang and Zhang, Guoliang and Song, Zhanjiang},
	Issn = {1000-9000},
	Journal = {Journal of Computer Science and Technology},
	Keywords = {MFCC},
	Number = {6},
	Pages = {582--589},
	Publisher = {Institute of Computing Technology},
	Title = {Comparison of different implementations of {MFCC}},
	Url = {http://cst.cs.tsinghua.edu.cn/~fzheng/PAPERS/2001/0109E_JCST_MFCCCOMPARE_ZF.pdf},
	Volume = {16},
	Year = {2001},
	Abstract = {The performance of the Mel-Frequency Cepstrum Coefficients (MFCC) may be affected by (1) the number of filters, (2) the shape of filters, (3) the way that filters are spaced, and (4) the way that the power spectrum is warped. In this paper, several comparison experiments are done to find a best implementation. The traditional MFCC calculation excludes the 0th coefficient for the reason that it is regarded as somewhat unreliable. According to the analysis and experiments, the authors find that it can be regarded as the generalized frequency band energy (FBE) and is hence useful, which results in the FBE-MFCC. The authors also propose a better analysis, namely the auto-regressive analysis, on the frame energy, which outperforms its 1st and/or 2nd order differential derivatives. Experiments across the 863 Speech Database show that, compared with the traditional MFCC with its corresponding auto-regressive analysis coefficients, the FBE-MFCC and the frame energy with their corresponding auto-regressive analysis coefficients form the best combination, reducing the Chinese syllable error rate (CSER) by about 10.0{\%}, while the FBE-MFCC with the corresponding auto-regressive analysis coefficients reducing CSER by 2.5{\%}. Comparison experiments are also done across a quite casual Chinese speech database, named Chinese Annotated Spontaneous Speech (CASS) corpus, the FBE-MFCC can reduce the error rate by about 2.9{\%} on an average.},
}

@inproceedings{Nordstrom:2006,
	Address = {Montreal, Quebec, Canada},
	Author = {Nordstrom, K. I. and Driessen, P. F.},
	Booktitle = {Proceedings of the International Conference on Digital Audio Effects (DAFx-06)},
	Keywords = {linear predictive coding, voice quality, preemphasis filter},
	Month = sep # "~18--20,",
	Pages = {157--160},
	Title = {Variable Pre-Emphasis {LPC} for Modeling Vocal Effort in the Singing Voice},
	Url = {http://www.dafx.ca/proceedings/papers/p_157.pdf},
	Year = {2006},
	Abstract = {In speech and singing, the spectral envelope of the glottal source varies according to different voice qualities such as vocal effort, lax voice, and breathy voice. In contrast, linear prediction coding (LPC) models the glottal source in a way that is not flexible. The spectral envelope of the source estimated by LPC is fixed and determined by the pre-emphasis filter. In standard LPC, the formant filter captures variation in the spectral envelope that should be associated with the source. This paper presents variable pre-emphasis LPC (VPLPC) as a technique to allow the estimated source to vary. This results in formant filters that remain more consistent across variations in vocal effort and breathiness. VPLPC also provides a way to change the envelope of the estimated source, thereby changing the perception of vocal effort. The VPLPC algorithm is used to manipulate some voice excerpts with promising but mixed results. Possible improvements are suggested.},
}

@inproceedings{dafx06_p133,
	Address = {Montreal, Quebec, Canada},
	Author = {Dixon, S.},
	Booktitle = {Proceedings of the International Conference on Digital Audio Effects (DAFx-06)},
	Month = sep # "~18--20,",
	Pages = {133--137},
	Title = {Onset Detection Revisited},
	Url = {http://www.dafx.ca/proceedings/papers/p_133.pdf},
	Year = {2006},
}

@inproceedings{dafx06_p101,
	Address = {Montreal, Quebec, Canada},
	Author = {Evangelista, Gianpaolo},
	Booktitle = {Proceedings of the International Conference on Digital Audio Effects (DAFx-06)},
	Keywords = {wavelets, fractals},
	Month = sep # "~18--20,",
	Note = {additional resources at: http://staffwww.itn.liu.se/{\textasciitilde}giaev/soundexamples.html},
	Pages = {101--106},
	Title = {Fractal Modulation Effects},
	Url = {http://www.dafx.ca/proceedings/papers/p_101.pdf},
	Year = {2006},
}

@inproceedings{Janer:2006,
	Address = {Montreal, Quebec, Canada},
	Author = {Janer, J. and Bonada, J. and Blaauw, M.},
	Booktitle = {Proceedings of the International Conference on Digital Audio Effects (DAFx-06)},
	Keywords = {singing voice, singing voice synthesis},
	Month = sep # "~18--20,",
	Pages = {41--44},
	Title = {Performance-Driven Control for Sample-Based Singing Voice Synthesis},
	Url = {http://www.dafx.ca/proceedings/papers/p_041.pdf},
	Year = {2006},
	Abstract = {In this paper we address the expressive control of singing voice synthesis. Singing Voice Synthesizers (SVS) traditionally require two types of inputs: a musical score and lyrics. The musical expression is then typically either generated automatically by applying a model of a certain type of expression to a high-level musical score, or achieved by manually editing low-level synthesizer parameters. We propose an alternative method, where the expression control is derived from a singing performance. In a first step, an analysis module extracts expressive information from the input voice signal, which is then adapted and mapped to the internal synthesizer controls. The presented implementation works in an off-line manner processing user input voice signals and lyrics using a phonetic segmentation module. The main contribution of this approach is to offer a direct way of controlling the expression of SVS. The further step is to run the system in real-time. The last section of this paper addresses a possible strategy for real-time operation.},
}

@article{Orio:2006,
	Author = {Orio, N.},
	Doi = {10.1561/1500000002},
	Journal = {Foundations and Trends in Information Retrieval},
	Keywords = {MIR},
	Month = nov,
	Number = {1},
	Pages = {1--90},
	Title = {Music Retrieval: A Tutorial and Review},
	Volume = {1},
	Year = {2006},
	Abstract = {The increasing availability of music in digital format needs to be matched by the development of tools for music accessing, filtering, classification, and retrieval. The research area of Music Information Retrieval (MIR) covers many of these aspects. The aim of this paper is to present an overview of this vast and new field. A number of issues, which are peculiar to the music language, are described---including forms, formats, and dimensions of music---together with the typologies of users and their information needs. To fulfil these needs a number of approaches are discussed, from direct search to information filtering and clustering of music documents. An overview of the techniques for music processing, which are commonly exploited in many approaches, is also presented. Evaluation and comparisons of the approaches on a common benchmark are other important issues. To this end, a description of the initial efforts and evaluation campaigns for MIR is provided.},
}

@inproceedings{Ishi:2004a,
	Author = {Ishi, C. T.},
	Booktitle = {Proceedings of the 8th International Conference on Spoken Language Processing (ICSLP 2004)},
	Pages = {941--944},
	Title = {A New Acoustic Measure for Aspiration Noise Detection},
	Url = {http://www.irc.atr.jp/~carlos/pdf/carlos-icslp200410.pdf},
	Volume = {II},
	Year = {2004},
	Abstract = {In this paper, we propose a new acoustic measure for detecting aspiration noise in vowels.  The measure is an index of synchronization between frequency bands around the first and third formants.  The measure is based on the principle that the vocal tract responses to the glottal excitation are synchronized between these frequency bands when aspiration noise is absent, and uncorrelated otherwise.  Evaluation results show that the proposed measure can be used together with spectral slope measures for automatic detection of aspiration noise.},
}

@inproceedings{Ishi:2005,
	Author = {Ishi, C. T. and Ishiguro, H. and Hagita, N.},
	Booktitle = {Proceedings of the 9th European Conference on Speech Communication and Technology},
	Pages = {481--484},
	Title = {Proposal of Acoustic Measures for Automatic Detection of Vocal Fry},
	Url = {http://www.irc.atr.jp/~carlos/pdf/carlos-eurospeech200509.pdf},
	Year = {2005},
	Abstract = {Vocal fry is a voice quality that often appears in relaxed voices indicating low tension, or in pressed voices expressing attitudes/feelings of surprise, admiration and suffering.  We propose a set of acoustic measures for automatically detecting vocal fry segments in speech utterances.  In order to deal with vocal fry utterances with very low fundamental frequencies, where classic short-term analysis methods become problematic, a glottal pulse synchronized method is proposed.  The acoustic measures are based on power, periodicity and similarity properties of vocal fry signals.  The basic idea is to scan for power peaks in a ``very short-term'' power contour, check for periodicity properties and evaluate a similarity measure between power peaks for deciding the possibility of vocal fry pulses.  Sub-harmonic properties are also taken into account in the periodicity analysis.  Evaluation of the proposed measures in automatic detection resulted in 73.3 {\%} correct detection, with an insertion error rate of 3.9 {\%}.},
}

@phdthesis{Brossier:2006,
	Author = {Brossier, P. M.},
	Month = aug,
	School = {Queen Mary, University of London},
	Title = {Automatic Annotation of Musical Audio for Interactive Applications},
	Url = {http://aubio.piem.org/phdthesis/},
	Year = {2006},
}

@article{Bello:2005,
	Author = {Bello, J. P. and Daudet, L. and Abdallah, S. and Duxbury, C. and Davies, M. and Sandler, M. B.},
	Issn = {1063-6676},
	Ja = {Speech and Audio Processing, IEEE Transactions on},
	Journal = {IEEE Transactions on Speech and Audio Processing},
	Keywords = {attack transients, audio, note segmentation, novelty detection, wavelets},
	Number = {5},
	Pages = {1035--1047},
	Title = {A Tutorial on Onset Detection in Music Signals},
	Ty = {JOUR},
	Url = {http://ieeexplore.ieee.org/iel5/89/32132/01495485.pdf?isnumber=&arnumber=1495485},
	Volume = {13},
	Year = {2005},
	Abstract = {Note onset detection and localization is useful in a number of analysis and indexing techniques for musical signals. The usual way to detect onsets is to look for `transient' regions in the signal, a notion that leads to many definitions: a sudden burst of energy, a change in the short-time spectrum of the signal or in the statistical properties, etc. The goal of this paper is to review, categorize, and compare some of the most commonly used techniques for onset detection, and to present possible enhancements. We discuss methods based on the use of explicitly predefined signal features: the signal's amplitude envelope, spectral magnitudes and phases, time-frequency representations; and methods based on probabilistic signal models: model-based change point detection, surprise signals, etc. Using a choice of test cases, we provide some guidelines for choosing the appropriate method for a given application.},
}

@inproceedings{Abdallah:2003,
	Author = {Abdallah, S. and Plumbley, M. D.},
	Booktitle = {Cambridge Music Processing Colloquium},
	Keywords = {onset detection, ICA, Hidden Markov Models},
	Month = mar,
	Title = {Unsupervised onset detection: A probabilistic approach using {ICA} and a hidden {Markov} classifier},
	Url = {http://www.elec.qmul.ac.uk/staffinfo/markp/2003/AbdallahPlumbley03-cmpc.pdf},
	Year = {2003},
	Abstract = {We describe an onset detection system that takes a two-stage approach, both of which are based on unsupervised learning in a probabilistic model.

The first stage uses independent component analysis (ICA) to fit a short-term non-Gaussian model to frames of audio data. This model is used to generate a reduced signal to be interpreted as the `surprisingness' of the original audio signal. Our hypothesis is that onsets and events generally are perceived as so because they are temporally localised surprises.

The second stage uses a hidden Markov model (HMM) with Gaussian state-conditional densities to do unsupervised clustering of the `surprise' signal as represented in a multidimensional embedding space. The clusters which emerge in this space can be associated the presence or absence of an onset, and so a trivial decision based on the current HMM state can be used to drive an onset detector.},
}

@inproceedings{duxbury02hybrid,
	Address = {Hamburg, Germany},
	Author = {Duxbury, C. and Sandler, M. and Davies, M.},
	Booktitle = {Proceedings of the International Conference on Digital Audio Effects (DAFx-02)},
	Keywords = {onset detection},
	Pages = {33--38},
	Text = {Christopher Duxbury, Mark Sandler, and Mike Davies, A Hybrid Approach to Musical Note Onset Detection,},
	Title = {A Hybrid Approach to Musical Note Onset Detection},
	Year = {2002},
}

@article{Ishi:2004,
	Author = {Ishi, C. T.},
	Doi = {10.1250/ast.25.299},
	Journal = {Acoustical Science and Technology},
	Number = {4},
	Pages = {299--302},
	Title = {Analysis of Autocorrelation-based Parameters in Creaky Voice},
	Url = {http://www.irc.atr.jp/~carlos/pdf/carlos-asj2004-acousticletter.pdf},
	Volume = {25},
	Year = {2004},
}

@inproceedings{Goto:1999,
	Author = {Goto, M. and Itou, K. and Hayamizu, S.},
	Booktitle = {Proceedings of the 6th European Conference on Speech Communication and Technology (Eurospeech '99)},
	Keywords = {Filled pause, Hesitation, Spontaneous speech},
	Title = {A real-time filled pause detection system for spontaneous speech recognition},
	Url = {http://staff.aist.go.jp/m.goto/PAPER/EUROSPEECH99.pdf},
	Year = {1999},
	Abstract = {This paper describes a method for automatically detecting filled (vocalized) pauses, which are one of the hesitation phenomena that current speech recognizers typically cannot handle. The detection of these pauses is important in spontaneous speech dialogue systems because they play valuable roles, such as helping a speaker keep a conversational turn, in oral communication. Although a few speech recognition systems have processed filled pauses within subword-based connected word recognition or word-spotting frameworks, they did not detect the pauses individually and consequently could not consider their roles. In this paper we propose a method that detects filled pauses and word lengthening on the basis of small fundamental frequency transition and small spectral envelope deformation under the assumption that speakers do not change articulator parameters during filled pauses. Experimental results for a Japanese spoken dialogue corpus show that our real-time filled-pause-detection system yielded a recall rate of 84.9{\%} and a precision rate of 91.5{\%}.},
}

@phdthesis{Masri1996thesis,
	Address = {Bristol, UK},
	Author = {Masri, P.},
	Keywords = {onset detection},
	School = {University of Bristol},
	Title = {Computer Modelling of Sound for Transformation and Synthesis of Musical Signals},
	Year = {1996},
	Abstract = {The purpose of this thesis is to develop a sound model that can be used as a creative tool by
professional musicians.  Post-production editing suites are used for compiling and arranging
music tracks, and for creating soundtracks and voice-overs for the radio, television and film
industries.  A sound model would bring a new dimension of flexibility to these systems,
allowing the user to stretch and mould sounds as they please.

Sound models already exist but they are limited both in their usability and in their scope for
representation.  All aspects of the model in this thesis use designer-preset global variables
which are transparent to the user.  Within this restriction and preserving manipulation
flexibility, the aim of the thesis is to improve the range of sounds that can be modelled and the
accuracy of modelling.  These are dependent on the choice of model elements and the accuracy
of the analysis-resynthesis system (which translates between the playable time domain
waveform and the controllable model feature domain, making the model usable).

The basis of the model of this thesis is a deterministic-stochastic classification;  the partials of
the harmonic structure of pitched sounds are individually represented in the deterministic
aspect, whilst the stochastic aspect models the remainder as broadband noise.  Three studies
were carried out to improve aspects of the analysis-resynthesis system.  These focus on:

* the time-frequency representation, by which the analyser `sees' detail in the sound;
* frame linking, which converts the instantaneous partial estimates into continuous
trajectories -- this is essential for synthesis quality and for musical manipulation;
* percussive note onsets, which are not represented in the existing models.

The standard time-frequency representation for sound modelling, the Short-Time Fourier
Transform, has limited resolution and is inadequate for capturing the detail of rapidly changing
elements.  The first study examines the distortion it generates when it represents a
nonstationary element and derives a method for extracting extra information from the
distortion, thereby improving the effective resolution.

The fact that partials belong to a harmonic structure is not considered in the existing `Nearest
Frequency' method of frame linking;  the result is audible scrambling of the higher frequencies.
The second study proposes using the harmonic structure as the basis for linking.  Although this
is not a new concept, it is implemented in such a way that detail can be extracted from the
harmonically weak start and end of harmonic regions, thereby improving synthesis quality.

The existing model assumes all sound elements are slow-changing, so abrupt changes are
poorly represented and sound diffused upon synthesis.  The third study finds a way of
incorporating `attack transients' into the model.  The method pre-scans a sound for percussive
onsets and synchronises both analysis and synthesis so as to avoid the previous problems.  The
crispness of synthesised attack transients clearly demonstrate the effectiveness of this method.

From many observations over the course of these studies, it became noticeable that the hard
deterministic-stochastic classification was not capturing the `roughness' of some sounds
accurately.  Further investigations revealed that detail is missing from the synthesised partials.
A new basis for a sound model, termed here the Noisy Partial model, aims to rectify this by
introducing the noisiness into the partials themselves.  In this new classification, deterministic
and stochastic appear as opposite extremes on a continuously variable scale.  The new model
promises a simplified structure and more efficient processing.  Suggestions are made for
investigating this further as a future work direction.},
}

@inproceedings{Paul-Brossier:2004,
	Author = {Brossier, P. M. and Bello, J. P. and Plumbley, M. D.},
	Booktitle = {Proceedings of the International Computer Music Conference (ICMC'04)},
	Keywords = {aubio, onset detection, real time},
	Pages = {458--461},
	Title = {Real-time temporal segmentation of note objects in music signals},
	Year = {2004},
}

@inproceedings{Brossier:2004,
	Author = {Brossier, P. M. and Bello, J. P. and Plumbley, M. D.},
	Booktitle = {Proceedings of the 5th International Conference on Music Information Retrieval (ISMIR 2004)},
	Keywords = {aubio, real time, onset detection},
	Pages = {331--336},
	Title = {Fast labelling of notes in music signals},
	Url = {http://ismir2004.ismir.net/proceedings/p060-page-331-paper242.pdf},
	Year = {2004},
	Abstract = {We present a new system for the estimation of note attributes from a live monophonic music source, within a short time delay and without any previous knowledge of the signal. The labelling is based on the temporal segmentation and the successive estimation of the fundamental frequency of the current note object. The setup, implemented around a small C library, is directed at the robust note segmentation of a variety of audio signals. A system for evaluation of performances is also presented. The further extension to polyphonic signals is considered, as well as design concerns such as portability and integration in other software environments.},
}

@article{Bello:2004,
	Author = {Bello, J. P. and Duxbury, C. and Davies, M. and Sandler, M.},
	Issn = {1070-9908},
	Ja = {Signal Processing Letters, IEEE},
	Journal = {IEEE Signal Processing Letters},
	Keywords = {audio databases, audio signal processing, attack transients, complex frequency domain, energy, onset detection, phase, audio, complex domain, music analysis},
	Number = {6},
	Pages = {553--556},
	Title = {On the use of phase and energy for musical onset detection in the complex domain},
	Ty = {JOUR},
	Url = {http://www.elec.qmul.ac.uk/people/juan/Documents/Bello-SPL-2004.pdf},
	Volume = {11},
	Year = {2004},
	Abstract = {We present a study on the combined use of energy and phase information for the detection of onsets in musical signals. The resulting method improves upon both energy-based and phase-based approaches. The detection function, generated from the analysis of the signal in the complex frequency domain is sharp at the position of onsets and smooth everywhere else. Results on a database of recordings show high detection rates for low rates of errors. The approach is more robust than its predecessors both theoretically and practically.},
}

@inproceedings{Philip-McLeod:2005,
	Author = {McLeod, P. and Wyvill, G.},
	Booktitle = {Proceedings of the International Computer Music Conference (ICMC'05)},
	Keywords = {tartini, pitch, pitch determination},
	Pages = {138--141},
	Title = {A Smarter Way To Find Pitch},
	Year = {2005},
	Abstract = {The `Tartini' project at the University of Otago aims to use the computer as a practical tool for singers and instrumentalists. Sound played into the system is analysed fast enough to create useful feedback for teaching or, at a higher level, for practising musicians to refine their technique. Central to this analysis is the accurate determination of musical pitch. We describe a fast, accurate and robust method for finding the continuous pitch in monophonic musical sounds. We employ a special normalised version of the Squared Difference Function (SDF) coupled with a peak picking algorithm. We show how to implement the algorithm efficiently. Inherent in our method is a `clarity' estimate that measures to what extent the sound has a tone. This has already found application in showing defects in a violinist's bowing technique.},
}

@inproceedings{Philip-McLeod:2003,
	Author = {McLeod, P. and Wyvill, G.},
	Title = {Visualization of Musical Pitch},
	Booktitle = {Proc. Computer Graphics International},
	Pages = {300--303},
	Year = {2003},
	Keywords = {user interface, pitch determination, visualisation, real time, Fourier transform, harmonics, tartini},
	Abstract = {We have created software that shows a musician the pitch of the notes he or she is playing or singing, in real time and very accurately. This is useful as a teaching aid for beginners and also for studying refinements of sound production such as vibrato.},
}

@phdthesis{Kim:2003,
	Author = {Kim, Y. E.},
	School = {Massachusetts Institute of Technology},
	Title = {Singing Voice Analysis/Synthesis},
	Year = {2003},
	Abstract = {The singing voice is the oldest and most variable of musical instruments. By combining music, lyrics, and expression, the voice is able to affect us in ways that no other instrument can. As listeners, we are innately drawn to the sound of the human voice, and when present it is almost always the focal point of a musical piece. But the acoustic flexibility of the voice in intimating words, shaping phrases, and conveying emotion also makes it the most difficult instrument to model computationally. Moreover, while all voices are capable of producing the common sounds necessary for language understanding and communication, each voice possesses distinctive features independent of phonemes and words. These unique acoustic qualities are the result of a combination of innate physical factors and expressive characteristics of performance, reflecting an individual's vocal identity.

A great deal of prior research has focused on speech recognition and speaker identification, but relatively little work has been performed specifically on singing. There are significant differences between speech and singing in terms of both production and perception. Traditional computational models of speech have focused on the intelligibility of language, often sacrificing sound quality for model simplicity. Such models, however, are detrimental to the goal of singing, which relies on acoustic authenticity for the non-linguistic communication of expression and emotion. These differences between speech and singing dictate that a different and specialized representation is needed to capture the sound quality and musicality most valued in singing.

This dissertation proposes an analysis/synthesis framework specifically for the singing voice that models the time-varying physical and expressive characteristics unique to an individual voice. The system operates by jointly estimating source-filter voice model parameters, representing vocal physiology, and modeling the dynamic behavior of these features over time to represent aspects of expression. This framework is demonstrated to be useful for several applications, such as singing voice coding, automatic singer identification, and voice transformation.},
}

@book{Oppenheim:1989,
	Address = {Upper Saddle River, NJ, USA},
	Author = {Oppenheim, Alan V. and Schafer, Ronald W.},
	Isbn = {0-13-216292-X},
	Publisher = {Prentice-Hall, Inc.},
	Title = {Discrete-Time Signal Processing},
	Year = {1989},
}

@book{Lyons:1996,
	Author = {Lyons, Richard G.},
	Title = {Understanding Digital Signal Processing},
	Publisher = {Addison-Wesley Longman Publishing Co., Inc.},
	Address = {Boston, MA, USA},
	Year = {1996},
	Isbn = {0201634678},
}

@book{dafx,
	Address = {New York, NY, USA},
	Editor = {Z{\"o}lzer, U.},
	Isbn = {0471490784},
	Publisher = {John Wiley and Sons, Inc.},
	Title = {{DAFX}: Digital Audio Effects},
	Year = {2002},
}

@article{Carballo:2000,
	Author = {Carballo, G. and Mendoza, E.},
	Journal = {Clinical Linguistics {\&} Phonetics},
	Number = {8},
	Pages = {587--601},
	Title = {Acoustic characteristics of trill productions by groups of {Spanish} children},
	Url = {http://www.ingentaconnect.com/content/tandf/tclp/2000/00000014/00000008/art00002},
	Volume = {14},
	Year = {2000},
	Abstract = {Contrary to the English /r/, which has been extensively analysed, there is very little information on the trilled consonants in Spanish. These sounds are in general difficult to produce for young Spanish children and occur later (than other consonant sounds) in normal development. This paper describes acoustic measurements made on the trill productions in Spanish children with varying degrees of speech intelligibility. The spectral (frequency, amplitude, C-V ratio) and temporal characteristics (duration of the trill, number of open and closed periods---apertures and occlusions---and duration of these periods) were studied in 45 children (Granada, Spain) between 3.0 and 9.6 years old, divided into 5 groups. The results reveal differences in spectral and temporal acoustic correlates of trill /r/ among the five speaker groups (e.g., the duration of the first aperture period was longer for the articulatory disordered group than for the normal control group). They seem to indicate, in particular, that children who have trouble learning to make the trill sound do little more than tap the alveolar ridge. Moreover, these children seem to use more of their tongues to make these gestures; also, they do not make many trills or make them very quickly.},
}

@book{OppenheimSchaferBuck,
	Address = {Upper Saddle River, NJ, USA},
	Author = {Oppenheim, Alan V. and Schafer, Ronald W. and Buck, John R.},
	Edition = {Second},
	Isbn = {0-13-754920-2},
	Publisher = {Prentice-Hall, Inc.},
	Title = {Discrete-Time Signal Processing},
	Year = {1999},
}

@inproceedings{Ozerov:2005,
	Author = {Ozerov, A. and Philippe, P. and Gribonval, R. and Bimbot, F.},
	Booktitle = {IEEE Workshop on Applications of Signal Processing to Audio and Acoustics},
	Doi = {10.1109/ASPAA.2005.1540176},
	Keywords = {filtering theory, maximum likelihood estimation, regression analysis, source separation},
	Month = oct,
	Pages = {90--93},
	Title = {One Microphone Singing Voice Separation Using Source-Adapted Models},
	Year = {2005},
	Abstract = {In this paper, the problem of one microphone source separation applied to singing voice extraction is studied. A probabilistic approach based on Gaussian mixture models (GMM) of the short time spectra of two sources is used. The question of source model adaptation is investigated in order to improve separation quality. A new adaptation method consisting in a filter adaptation technique via the maximum likelihood linear regression (MLLR) is presented with an associated filter-adapted training phase.},
}

@article{ASP20041036,
	Author = {Glass, Alexis and Fukudome, Kimitoshi},
	Doi = {10.1155/S1110865704402078},
	Journal = {EURASIP Journal on Applied Signal Processing},
	Keywords = {warped LPC, audio compression, structured audio, physical modelling, sound synthesis},
	Number = {7},
	Pages = {1036--1044},
	Title = {Warped Linear Prediction of Physical Model Excitations with Applications in Audio Compression and Instrument Synthesis},
	Volume = {2004},
	Year = {2004},
	Abstract = {A sound recording of a plucked string instrument is encoded and resynthesized using two stages of prediction. In the first stage of prediction, a simple physical model of a plucked string is estimated and the instrument excitation is obtained. The second stage of prediction compensates for the simplicity of the model in the first stage by encoding either the instrument excitation or the model error using warped linear prediction. These two methods of compensation are compared with each other, and to the case of single-stage warped linear prediction, adjustments are introduced, and their applications to instrument synthesis and MPEG4's audio compression within the structured audio format are discussed.},
}

@article{El-Jaroudi:1991,
	Author = {El-Jaroudi, A. and Makhoul, J.},
	Doi = {10.1109/78.80824},
	Journal = {IEEE Transactions on Signal Processing},
	Keywords = {convergence of numerical methods, iterative methods, poles and zeros, spectral analysis, speech analysis and processing, speech synthesis},
	Number = {2},
	Pages = {411--423},
	Title = {Discrete all-pole modeling},
	Volume = {39},
	Year = {1991},
	Abstract = {A method for parametric modeling and spectral envelopes when only a discrete set of spectral points is given is introduced. This method, called discrete all-pole (DAP) modeling, uses a discrete version of the Itakura-Saito distortion measure as its error criterion. One result is an autocorrelation matching condition that overcomes the limitations of linear prediction and produces better fitting spectral envelopes for spectra that are representable by a relatively small discrete set of values, such as in voiced speech. An iterative algorithm for DAP modeling that is shown to converge to a unique global minimum is presented. Results of applying DAP modeling to real and synthetic speech are also presented. DAP modeling is extended to allow frequency-dependent weighting of the error measure, so that spectral accuracy can be enhanced in certain frequency regions.},
}

@article{Arroabarren:2004,
	Author = {Arroabarren, I. and Carlosena, A.},
	Doi = {10.1155/S1110865704401127},
	Journal = {EURASIP Journal on Applied Signal Processing},
	Keywords = {voice quality, source-filter model, inverse filtering, singing voice, vibrato, sinusoidal model},
	Number = {7},
	Pages = {1007--1020},
	Title = {Vibrato in Singing Voice: The Link between Source-Filter and Sinusoidal Models},
	Url = {http://www.hindawi.com/GetArticle.aspx?doi=10.1155/S1110865704401127&e=CTA},
	Volume = {2004},
	Year = {2004},
	Abstract = {The application of inverse filtering techniques for high-quality singing voice analysis/synthesis is discussed. In the context of source-filter models, inverse filtering provides a noninvasive method to extract the voice source, and thus to study voice quality. Although this approach is widely used in speech synthesis, this is not the case in singing voice. Several studies have proved that inverse filtering techniques fail in the case of singing voice, the reasons being unclear. In order to shed light on this problem, we will consider here an additional feature of singing voice, not present in speech: the vibrato. Vibrato has been traditionally studied by sinusoidal modeling. As an alternative, we will introduce here a novel noninteractive source filter model that incorporates the mechanisms of vibrato generation. This model will also allow the comparison of the results produced by inverse filtering techniques and by sinusoidal modeling, as they apply to singing voice and not to speech. In this way, the limitations of these conventional techniques, described in previous literature, will be explained. Both synthetic signals and singer recordings are used to validate and compare the techniques presented in the paper.},
}

@inproceedings{Li:2005,
	Author = {Li, Yipeng and Wang, DeLiang},
	Booktitle = {Proceedings of the IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
	Title = {Detecting Pitch of Singing Voice in Polyphonic Audio},
	Year = {2005},
	Abstract = {We propose a robust algorithm to detect the pitch of singing voice in polyphonic audio. A new channel/peak selection scheme is introduced to exploit the salience of singing voice and the beating phenomenon in high frequency channels. An HMM model is employed to integrate the periodicity information across frequency channels and time frames. Quantitative evaluation shows that the new system performs significantly better than existing algorithms for predominant pitch detection in polyphonic audio.},
}

@inproceedings{Grubb:1998,
	Author = {Grubb, Lorin and Dannenberg, Roger B.},
	Booktitle = {Proceedings of the International Computer Music Conference},
	Organization = {International Computer Music Association},
	Pages = {37--44},
	Title = {Enhanced Vocal Performance Tracking Using Multiple Information Sources},
	Year = {1998},
}

@phdthesis{Griebel:2002,
	Address = {Vienna, Austria},
	Author = {Griebel, H.},
	Keywords = {speech; music analysis; psychoacoustics; fundamental frequency; pitch; acoustics; formant; speech analysis},
	Month = sep,
	School = {Vienna University of Technology},
	Title = {Time-frequency methods for pitch detection},
	Url = {http://www2.arcs.ac.at/dissdb/rn039350},
	Year = {2002},
	Abstract = {The thesis proposes new methods for the pitch detection of monophonic and polyphonic signals. Investigated have been speech and music signals with non-ideal real properties, with a little number of harmonics or stretched harmonics. Pitch detection means detecting the fundamental frequency of a harmonic complex sound, i.e. the sound consists of a fundamental tone and harmonics at integral multiples of the fundamental frequency. Additionally the detection of a single sinusoid is treated in general, with strong overlap from arbitrary other small-band components and with strong overlap from other stable sinusoidal components. Fundamental problem of polyphonic pitch detection is the overlapping of signal components. Estimation of frequency, amplitude and phase is no simple task anymore. Resolving overlapping determined signal components was neglected in the past and is main part of this thesis. The detection of individual sinusoidal components is subproblem of the fundamental frequency detection. Motivation for the thorough treatment is the problem of detecting voiced and unvoiced segments of a speech signal, which is more difficult than detecting the fundamental frequency and further, the application in automatic speech recognition. If the amplitudes of speech harmonics and the tone have the same order of magnitude, individual kernels of the front-end filterbank are disturbed and the error rate deteriorates. The proposed method uses two additional time-frequency planes, which represent the smoothness of the sinusoidal signal. It is possible to detect stationary sinusoidal signal even with strong overlap of partial tones of a speech signal. Polyphonic pitch detection is main part of an automatic music recognition system. Musicians could use such a system, analysis of musical expression and tune recognition are important applications. The evaluated iterative method identifies the most easily detectable sound and subtracts it from the overall spectrum. Both steps are repeated until no sound is detectable anymore. The sound is detected locally in bands and does not utilize partial tracks. The subtraction simplifies the spectrum in the sense, that overlaps are resolved and other sounds become detectable. Detection of the fundamental frequency of speech is economically the most important problem. With an accurate signal model many problems can be solved easier or can be solved at all. Applications are denoising or equalizing of speech, estimating syllables rates, speech recognition and speech detection. Despite the vast amount of research already done on the field current available methods are not reliable enough. The proposed method overcomes some of the shortcomings and gives more reliable results than other methods, especially all correlation based methods.},
}

@phdthesis{Every2006thesis,
	Address = {UK},
	Author = {Every, Mark},
	Month = {February},
	School = {University of York},
	Title = {Separation of Musical Sources and Structure from Single-Channel Polyphonic Recordings},
	Url = {http://www.ee.surrey.ac.uk/Personal/M.Every/EveryPhD06.pdf},
	Year = {2006},
	Abstract = {

       The thesis deals principally with the separation of pitched sources from single-channel polyphonic musical recordings. The aim is to extract from a mixture a set of pitched instruments or sources, where each source contains a set of similarly sounding events or notes, and each note is seen as comprising partial, transient and noise content. The work also has implications for separating non-pitched or percussive sounds from recordings, and in general, for unsupervised clustering of a list of detected audio events in a recording into a meaningful set of source classes. The alignment of a symbolic score/MIDI representation with the recording constitutes a pre-processing stage. The three main areas of contribution are: firstly, the design of harmonic tracking algorithms and spectral-filtering techniques for removing harmonics from the mixture, where particular attention has been paid to the case of harmonics which are overlapping in frequency. Secondly, some studies will be presented for separating transient attacks from recordings, both when they are distinguishable from and when they are overlapping in time with other transients. This section also includes a method which proposes that the behaviours of the harmonic and noise components of a note are partially correlated. This is used to share the noise component of a mixture of pitched notes between the interfering sources. Thirdly, unsupervised clustering has b