diff --git a/.gitignore b/.gitignore index 4d9b4e6..8754317 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,8 @@ */*pyc *pyc -*.mid \ No newline at end of file +*.mid +*.midi +*.wav +dist +MANIFEST +slides.md diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..c16a390 --- /dev/null +++ b/Makefile @@ -0,0 +1,8 @@ +slides: + printf '# ' > slides.md + head -n1 readme.md >> slides.md + echo 'Thomas Levine ([thomaslevine.com](http://thomaslevine.com)),
' >> slides.md + echo 'csv soundsystem ([csvsoundsystem.com](http://csvsoundsystem.com))' >> slides.md + echo >> slides.md + grep '^\(\[\?!\|##\)' readme.md | sed -e 's/^##/#/' -e 's/^/\n/' >> slides.md + reveal-md -t solarized -s '\n\n' slides.md diff --git a/ddpy.py b/ddpy.py deleted file mode 100644 index 10320e4..0000000 --- a/ddpy.py +++ /dev/null @@ -1,115 +0,0 @@ -import itertools - -from midiutil.MidiFile import MIDIFile - -def to_midi(table, filename, *args, **kwargs): - ''' - Args: - table: An iterable of dict-likes, a dict-like of iterables, - or a pandas.DataFrame-like. - filename: A string (like 'output/data.midi') - *args, **kwargs: passed along to df_to_midi - Returns: - None - - >>> to_midi([{'year':2010,'gdp':8},{'year':2011,'gdp':9}], 'gdp.midi', {}) - - >>> to_midi([[2010,8],[2011,9]], 'gdp.midi', {}) - Traceback (most recent call last): - ... - TypeError - ''' - - ''' - _check_types(table) - - if _is_like_dataframe(table): - dict_table = iter(table.to_dict()) - elif _is_iterable(table): - dict_table = iter(table) - elif _is_like_dict(table): - try: - dict_table = (dict(table.keys(), row) for row in itertools.izip(table.values())) - except: - raise TypeError - - for row in dict_table: - if not _is_like_dict(row): - raise TypeError - ''' - - m = df_to_midi(table, *args, **kwargs) - - binfile = open(filename, 'wb') - m.writeFile(binfile) - binfile.close() - -def from_midi(filename): - ''' - Args: - filename: A string (like 'input/data.midi') - Returns: - An iterable of dictionaries (table) and - a dictionary of musical parameters (music) - - >>> map(type, from_midi('gdp.midi')) - [, ] - ''' - return (({} for row in range(3)), {}) - -def _check_types(table): - ''' - Args: - table: An iterable of dict-likes, a dict-like of iterables, - or a pandas.DataFrame-like. 
- Returns: - None - Raises: - TypeError on invalid input - - >>> _check_types([{'year':2010,'gdp':8},{'year':2011,'gdp':9}]) - - >>> _check_types({2010:8,2011:9}) - Traceback (most recent call last): - ... - TypeError - ''' - if _is_like_dataframe(table): - pass - elif _is_like_dict(table): - are_iterable = map(_is_iterable, table.values()) - if not set(are_iterable) == {True}: - raise TypeError - elif _is_iterable(table): - pass - else: - raise TypeError - -def _is_like_dataframe(thing): - return hasattr(thing, 'to_dict') - -def _is_like_dict(thing): - return hasattr(thing, 'keys') - -def _is_iterable(thing): - return hasattr(thing, '__iter__') - - -def df_to_midi(df, bpm = 180): - ''' - Args: - table: A pandas.DataFrame - Returns: - A MIDI thingy - ''' - m = MIDIFile(df.shape[1]) - for col_number, col_name in enumerate(df.columns): - m.addTrackName(col_number,0,col_name) - m.addTempo(col_number,0,bpm) - for time,note in enumerate(df[col_name]): - m.addNote(col_number,0,note,time,1,100) - return m - -if __name__ == '__main__': - import doctest - doctest.testmod() diff --git a/ddpy/__init__.py b/ddpy/__init__.py new file mode 100644 index 0000000..2e75be4 --- /dev/null +++ b/ddpy/__init__.py @@ -0,0 +1,85 @@ +#!/usr/bin/env python +# This file is part of ddpy. + +# Copyright (C) 2013 Brian Abelson, Thomas Levine and other contributors +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this +# software and associated documentation files (the "Software"), to deal in the Software +# without restriction, including without limitation the rights to use, copy, modify, +# merge, publish, distribute, sublicense, and/or sell copies of the Software, and to +# permit persons to whom the Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be included in all copies +# or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR +# PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE +# OR OTHER DEALINGS IN THE SOFTWARE. + +import itertools + +import numpy +from midiutil.MidiFile import MIDIFile + +__author__ = 'Brian Abelson (brianabelson.com) and Thomas Levine (thomaslevine.com)' +__version__ = '0.2.0' + +def to_midi(table, filename, *args, **kwargs): + ''' + Args: + table: A pandas.DataFrame + filename: A string (like 'output/data.midi') + *args, **kwargs: passed along to df_to_midi + Returns: + None + + >>> import pandas; to_midi(pandas.DataFrame([{'year':2010 - 2000,'gdp':8},{'year':2011 - 2000,'gdp':9}]), 'gdp.midi') + ''' + + m = df_to_midi(table, *args, **kwargs) + + binfile = open(filename, 'wb') + m.writeFile(binfile) + binfile.close() + +def from_midi(filename): + ''' + Args: + filename: A string (like 'input/data.midi') + Returns: + An iterable of dictionaries (table) and + a dictionary of musical parameters (music) + + >>> map(type, from_midi('gdp.midi')) + [, ] + ''' + raise NotImplementedError("We're not implementing this until we have a lossless to_midi function.") + return (({} for row in range(3)), {}) + +def df_to_midi(df, bpm = 180): + ''' + Args: + table: A pandas.DataFrame + Returns: + A MIDI thingy + ''' + m = MIDIFile(df.shape[1]) + for col_number, col_name in enumerate(df.columns): + m.addTrackName(col_number,0,col_name) + m.addTempo(col_number,0,bpm) + for time,note in enumerate(df[col_name]): + if numpy.isnan(note): + pass + elif note in range(128): + m.addNote(col_number,0,note,time,1,100) + else: + raise NotImplementedError('Only notes 0 to 127 and NaN are supported.') + return m 
+ +if __name__ == '__main__': + import doctest + doctest.testmod() diff --git a/example.py b/ddpy/example.py similarity index 100% rename from example.py rename to ddpy/example.py diff --git a/test_df_to_midi.py b/ddpy/test_df_to_midi.py similarity index 79% rename from test_df_to_midi.py rename to ddpy/test_df_to_midi.py index 869c421..fd6a46b 100644 --- a/test_df_to_midi.py +++ b/ddpy/test_df_to_midi.py @@ -1,6 +1,8 @@ import nose.tools as n import pandas +import numpy from midiutil.MidiFile import MIDIFile + from ddpy import df_to_midi def assert_midi_equal(a, b): @@ -29,6 +31,27 @@ def test_one_int_column(): observed = df_to_midi(df, bpm = 120) assert_midi_equal(observed, expected) +def test_missing_value(): + ''' + A data frame with a single integer column + should be converted correctly. + ''' + expected = MIDIFile(1) + expected.addTrackName(0,0,"guitar") + expected.addTempo(0,0,120) + for time,note in enumerate([38, None, 42, 43]): + if note != None: + expected.addNote(0,0,note,time,1,100) + + df = pandas.DataFrame([ + {'guitar':38}, + {'guitar':numpy.nan}, + {'guitar':42}, + {'guitar':43}, + ]) + observed = df_to_midi(df, bpm = 120) + assert_midi_equal(observed, expected) + def test_two_int_columns(): ''' A data frame with a single integer column diff --git a/img/1028.png b/img/1028.png new file mode 100644 index 0000000..d54eac3 Binary files /dev/null and b/img/1028.png differ diff --git a/img/4l-FixedScale-NoMuProf2-preview.png b/img/4l-FixedScale-NoMuProf2-preview.png new file mode 100644 index 0000000..f63076c Binary files /dev/null and b/img/4l-FixedScale-NoMuProf2-preview.png differ diff --git a/img/4l-FixedScale-NoMuProf2.gif b/img/4l-FixedScale-NoMuProf2.gif new file mode 100644 index 0000000..f965b48 Binary files /dev/null and b/img/4l-FixedScale-NoMuProf2.gif differ diff --git a/img/720px-Chernoff_faces_for_evaluations_of_US_judges.svg.png b/img/720px-Chernoff_faces_for_evaluations_of_US_judges.svg.png new file mode 100644 index 0000000..f750d9e 
Binary files /dev/null and b/img/720px-Chernoff_faces_for_evaluations_of_US_judges.svg.png differ diff --git a/img/Chopin_Op._10_No._1_Godowsky's_first_version.jpg b/img/Chopin_Op._10_No._1_Godowsky's_first_version.jpg new file mode 100644 index 0000000..b57f8bb Binary files /dev/null and b/img/Chopin_Op._10_No._1_Godowsky's_first_version.jpg differ diff --git a/img/aplpack_problem.png b/img/aplpack_problem.png new file mode 100644 index 0000000..d2d46b5 Binary files /dev/null and b/img/aplpack_problem.png differ diff --git a/img/artichoke.jpg b/img/artichoke.jpg new file mode 100644 index 0000000..c4a84c0 Binary files /dev/null and b/img/artichoke.jpg differ diff --git a/img/borat.png b/img/borat.png new file mode 100644 index 0000000..db0034b Binary files /dev/null and b/img/borat.png differ diff --git a/img/christmas-screenshot.png b/img/christmas-screenshot.png new file mode 100644 index 0000000..1d52417 Binary files /dev/null and b/img/christmas-screenshot.png differ diff --git a/img/csv.png b/img/csv.png new file mode 100644 index 0000000..6ee1f91 Binary files /dev/null and b/img/csv.png differ diff --git a/img/dubstep-preview.png b/img/dubstep-preview.png new file mode 100644 index 0000000..9c6904f Binary files /dev/null and b/img/dubstep-preview.png differ diff --git a/img/fms-symphony-frame.png b/img/fms-symphony-frame.png new file mode 100644 index 0000000..0f55acf Binary files /dev/null and b/img/fms-symphony-frame.png differ diff --git a/img/gap-interpolated.jpg b/img/gap-interpolated.jpg new file mode 100644 index 0000000..d217ce8 Binary files /dev/null and b/img/gap-interpolated.jpg differ diff --git a/img/gap.jpg b/img/gap.jpg new file mode 100644 index 0000000..cb9ebad Binary files /dev/null and b/img/gap.jpg differ diff --git a/img/midi-piano.jpg b/img/midi-piano.jpg new file mode 100644 index 0000000..8375768 Binary files /dev/null and b/img/midi-piano.jpg differ diff --git a/img/minard.png b/img/minard.png new file mode 100644 index 
0000000..82901ef Binary files /dev/null and b/img/minard.png differ diff --git a/img/open-doors.jpg b/img/open-doors.jpg new file mode 100644 index 0000000..4c3d7a1 Binary files /dev/null and b/img/open-doors.jpg differ diff --git a/img/sheet-music.jpg b/img/sheet-music.jpg new file mode 100644 index 0000000..b57f8bb Binary files /dev/null and b/img/sheet-music.jpg differ diff --git a/play.sh b/play.sh new file mode 100755 index 0000000..5b5db52 --- /dev/null +++ b/play.sh @@ -0,0 +1,3 @@ +#!/bin/sh +# Something is wrong with Tom's sound card, so he needed this weirdness. +timidity -Ow $1 && aplay --device=hw:1 $(echo "$1"|cut -d. -f1).wav diff --git a/readme.md b/readme.md index 4608214..9398230 100644 --- a/readme.md +++ b/readme.md @@ -2,9 +2,81 @@ Data music for big data analysis ===== ## Introduction -Visualization can only support so many variables. -In order to study high-dimensional datasets, -we need to leverage more senses, like the sense of sound. +[![For handle big data, solution is very simple: buy bigger monitor and use smaller font in the terminal.](img/borat.png)](https://twitter.com/mysqlborat/status/306078371182428161) + +Today, we produce more digital content than ever before. +Making sense of the data is becoming harder as the data +get more complex, so we need to develop new tools. +In particular, we need something more powerful than data visualization. + +### The problem with data visualization +Data visualization cannot support wide datasets, with lots +of variables. + +![Minard's map of Napoleon's march](img/minard.png) + + +This plot of Napoleon's march is seen as an exceptional +piece of data visualization that conveys the multivariate +relationships in the world, and it contains about seven +variables: Longitude, latitude, date, temperature, direction +branch, army size. (You could arrive at a slightly different +number if you define the variables differently.) 
+ +The best of visualizations only supports a few variables, +but today's data might contain thousands of variables. +Typically, we deal with this by reducing dimensions before plotting +or by making multiple plots, but these approaches lose information. +Rather than using these lossy visual approaches, we would like to +represent multivariate data in a more raw form. +How can we represent more raw variables? + +Chernoff faces can get us to about twenty variables. + +![Chernoff faces](img/720px-Chernoff_faces_for_evaluations_of_US_judges.svg.png) + + +Animations help a bit too. + +[![complicated plot](img/4l-FixedScale-NoMuProf2-preview.png)](img/4l-FixedScale-NoMuProf2.gif) + +But we're still only a little bit further. + +I think the future is in multisensory data experiences. Food is the prime +example of this, as we use all five senses in experiencing it. + +![Artichoke pizza](img/artichoke.jpg) + + +Unfortunately, computers don't have particularly good APIs for touch, +taste or smell, so it is harder to represent your data for these senses. +On the other hand, they do have good sound APIs, so we can start there. + +### Other reasons for music +As we explained above, music lets us use a different sense (sound) than visuals do. +If we combine music with visuals, we can consume data through multiple +senses and thus experience higher-dimensional data. This ability to +represent multivariate data is the main promise I see in data-driven +music, but I see some side benefits too. + +#### Accessibility +!["Opening Doors to IT" logo](img/open-doors.jpg) + + +#### Ambient perception +With traditional data visualizations, we need to focus +actively on the visualization. With sound, it is natural +to have a data sonification playing in the background +all of the time, like music. + +#### Reaching young people +Data is in. 
+ +[![Government representatives](img/dubstep-preview.png)](http://www.youtube.com/watch?v=JwuEnyV1Cb0) ### History of ddpy [csv soundsystem](http://csvsoundsystem.com) makes @@ -19,17 +91,6 @@ The API for ddpy is inspired largely by a prototype we build for making music [from Google Spreadsheets](https://github.com/csv/sheetmusic). -### Why music -High dimensions - -thinking about multivariate analysis - -gastronomification - -culture - -accessibility - ### Today Today, you'll learn how to transform a dataset into music. We'll use the `ddpy` package for this tutorial, but the @@ -38,13 +99,23 @@ turn your data into music. ## Install You'll need a means of playing MIDI files. -[timidity++]() is one option. - +[timidity++](http://timidity.sourceforge.net/) is one option. +Or just use a website like [SolMiRe](http://solmire.com). -You'll also need [ddpy](). +You'll also need [ddpy](https://github.com/csv/ddpy). pip install ddpy +## Imports +The code examples below expect the following to have been run. + +```python +import numpy +import pandas +import pandas.io.wb +from ddpy import to_midi +``` + ## Tables I see the whole world as collections of things, which I like to represent as data tables. Rows @@ -57,9 +128,49 @@ collection of sounds. Thus, columns are instruments, rows are beats (or some other time-related thing), and cells contain notes. -![Ordinary sheet music]() - -![Music as a spreadsheet/table, with cells containing notes like "A4" and "C#3"]() +Here's a passage from Chopin's +[Étude Op. 10, No. 1](http://en.wikipedia.org/wiki/%C3%89tude_Op._10,_No._1_%28Chopin%29) +in ordinary sheet music. + +![Ordinary sheet music](img/Chopin_Op._10_No._1_Godowsky%27s_first_version.jpg) + + +And here it is as comma-separated values. (Well almost; +it doesn't include the two chords of dotted half notes.) 
+ + + + left_hand, right_hand + NA, NA + C2, E6 + G2, C6 + C3, G5 + E3, C5 + C3, E5 + G3, C5 + C4, G4 + E4, C4 + C4, G4 + G4, C5 + C5, G5 + E5, C6 + C5, G5 + G4, C5 + C4, G5 + E4, C5 + C4, G4 + G3, C4 + C3, E4 + E3, C4 + C3, G3 + G2, E3 + C2, C3 + +Rather than composing music as traditional sheet music, +we can use a table-editing program of our choice to compose +this sort of table. ## Pandas to MIDI ddpy provides a `to_midi` function that converts @@ -67,10 +178,8 @@ a pandas data frame to a MIDI file. to_midi(df, 'output.mid') -It currently supports the following subset of data -frame possibilities. - -* ... +It currently only supports integer columns containing +integers from 0 to 127. Text is represented as lyric events, integers are represented as discrete beats, and floats are @@ -84,7 +193,7 @@ zero correspends to the lowest (left-most) key, and 127 corresponds to the right-most key. ddpy just passes these numbers from our data frame into the MIDI file. -![Piano with 128 keys, numbered from 0 to 127]() +![Piano with 128 keys, numbered from 0 to 127](img/midi-piano.jpg) Thus, we can compose some simple music by making columns with numbers from 0 to 127. Here's a chromatic scale. @@ -94,20 +203,19 @@ df = pandas.DataFrame({'chromatic':range(50, 63)}) to_midi(df, 'chromatic.mid') ``` -A major scale +A major (Ionian) scale ```python df = pandas.DataFrame({'major':[50, 52, 54, 55, 57, 59, 61, 62}) to_midi(df, 'major_scale.mid') ``` -A XXX minor scale +A natural minor (Aeolian) scale ```python -df = pandas.DataFrame({'minor':[]}) +df = pandas.DataFrame({'minor':[50, 52, 53, 55, 57, 58, 60, 62]}) to_midi(df, 'minor_scale.mid') ``` - Some minor chords (multiple instruments) ```python @@ -128,7 +236,6 @@ to_midi(df, 'normal.mid') df = pandas.DataFrame({'gamma':[round(random.gammavariate(2, 3)) for i in range(24)]}) to_midi(df, 'gamma.mid') ``` - You don't always need to play something; here's a Bernoulli rhythm. 
```python @@ -139,7 +246,13 @@ to_midi(df, 'bernoulli.mid') ### Exercise Load a dataset into a pandas data frame, and convert it to MIDI. You can use any dataset you want, but here's an option in case you -can't come up with any. XXX +can't come up with any. + +```python +df = pandas.io.wb.download(indicator=['NY.GDP.PCAP.KD','EN.ATM.CO2E.KT'], + country=['US', 'CA', 'MX'], start=1900, end=2013) +``` + Don't worry about doing anything that complicated; we'll do that later. @@ -152,27 +265,23 @@ explain it. A MIDI file contains up to 128 different instruments (columns). Each of these contains up to 16 different tracks. -Within each track, we have a bunch of events, including - -* note -* ... - -There are also "meta-events", which include - -* a -* b +Within each track, we have a bunch of events, like +discrete notes and bends in pitch. There are also +"meta-events", which include lyrics. Why do we need this concept of events? We are using a MIDI file, but you can also emit MIDI events directly to other software, live. These live events use the same protocol as the events in our file. +ddpy currently only implements discrete notes. That +is, you can say that a note should be played at a +constant pitch for a certain period of time. + ## Preparing our data so the music sounds nice I've come up with a few elements in the production of interesting data music. -scaling - ### Data must have a noticeable pattern. Random music gets boring quickly. @@ -184,7 +293,11 @@ to_midi(df, 'random.mid') Similarly, empirical data that are effectively random aren't that interesting either. 
- Example +```python +df = pandas.io.wb.download(indicator='CM.MKT.INDX.ZG', + country='US', start=1900, end=2010) +to_midi(df, 'effectively_random.mid') +``` This gets more important as you add instruments the second instrument normally needs to have @@ -200,21 +313,28 @@ to_midi(df, 'two_random_instruments.mid') ``` ```python -# XXX add a real dataset -df = pandas.DataFrame() -to_midi(df, 'two_related_instruments.mid') -``` +data = pandas.io.wb.download(indicator=['NY.GDP.PCAP.KD','EN.ATM.CO2E.KT'], + country='US', start=1900, end=2010) -Periodic trends work particularly well. +def scale_for_midi(series, lowest = 0, highest = 127): + series_int = series.map(float).map(int) + return lowest + (highest - lowest) * (series_int - series_int.min()) / (series_int.max() - series_int.min()) -```python -# XXX add a real dataset -df = pandas.DataFrame() -to_midi(df, 'periodic_trends.mid') +music = pandas.DataFrame({ + 'gdp':scale_for_midi(data['NY.GDP.PCAP.KD'], lowest = 36, highest = 72), + 'co2':scale_for_midi(data['EN.ATM.CO2E.KT'], lowest = 36, highest = 72), +}) +to_midi(music, 'two_instruments.mid') ``` +Periodic trends work quite well. +Here's [transit ridership](http://thomaslevine.com/!/ridership-rachenitsa). + + + #### Exercise Make a simple song from two variables that are somehow related. +Again, use any dataset you want. ### You're still making music We started with the example of mapping numbers @@ -232,15 +352,24 @@ is major or minor. Then you create one column to convert to MIDI. 
```python -# XXX add a real dataset +gdp_df = pandas.io.wb.download(indicator='NY.GDP.PCAP.KD',country='US', start=1900, end=2012) +gdp = list(reversed(gdp_df['NY.GDP.PCAP.KD'])) df = pandas.DataFrame({ - 'year':[], - 'prop_something': [], # scale the value to a reasonable range of base notes - 'better_than_last_year': [], #this becomes major or minor + 'gdp':gdp[1:], + 'better.than.last.year': gdp[1:] > gdp[:-1], +}) + +music = pandas.DataFrame({ + 'base.note':scale_for_midi(df['gdp'], lowest = 48, highest = 60), + 'better.than.last.year': df['better.than.last.year'], + 'downbeat':reduce(lambda a,b:a+b,[[12, numpy.nan, numpy.nan, numpy.nan] for i in range(df.shape[0])])[:df.shape[0]], + 'thirdbeat':reduce(lambda a,b:a+b,[[numpy.nan, numpy.nan, 24, numpy.nan] for i in range(df.shape[0])])[:df.shape[0]], }) -# Use different states from the ACS. Some interesting -# statistic means major/minor. -to_midi(df, 'periodic_trends.mid') +music['third'] = music['base.note'] + 4 +music[music['better.than.last.year']]['third'] = music[music['better.than.last.year']]['third'] + 1 +del music['better.than.last.year'] + +to_midi(music, 'change_in_gdp.mid') ``` Also, rows in your dataset could correspond to things @@ -248,24 +377,12 @@ other than beats, like a measure, a phrase, or a stanza. This is especially helpful when you're dealing with data of varied resolution (for example, monthly versus daily). -```python -df = pandas.DataFrame({ - 'year':[], - 'new_york':[], - 'new_jersey':[], - 'total':[], -}) -# Map total to aa lower something that varies less -# and the states to higher, melodic things. Each phrase -# includes all of the states, each state as a separate beat. -``` - #### Exercise Map some data onto musical aesthetics other than pitch. If you know any music theory, do get creative with this. -For something simple, you could try chords. To make a major -chord from a base note, play the following notes. +For something simple, you could try three-note chords (triads). 
+To make a major triad from a base note, play the following notes. * the base note * the base note plus four @@ -277,46 +394,6 @@ To make a minor chord, play the following notes. * the base note plus three * the base note plus seven -To make a seventh chord (XXX), play the ordinary major or minor -chord with a fourth note; the fourth note is the base note plus XXX - -### Gaps in data along your time variable are annoying -Your music can get boring if it doesn't change for very long. -This can happen if you have a particular sort of missing data. -Let's say that you have a dataset about locations of XXX -and you map the locations to the time variable. That might sound -like this. - -```python -``` - -If your instrument broke between locations 88 ft and 204 ft, -it'll sound like this. - -```python -``` - -That gap is inconvenient. If you are dealing with datasets like -this, you'll have to come up with some way of dealing with it. - -For inspiration, think about how we deal with this in graphs. -Sometimes, the gap occurs just once and we use a broken scale. - -XXX - -In some cases, it might make sense to interpolate the data and -indicate that we are doing so. - -XXX - -In other cases, the gap really just means that we should be -plotting our data on a different scale. - -XXX - -#### Exercise -No exercise for this, just something to think about - ### Outliers are your solos If you follow the advice above, you'll have a very coherent piece, where everything within in relates to @@ -324,17 +401,14 @@ everything else. This in itself gets boring, but it allows you to create interesting sequences that sharply contrast the rest of the piece. And these interesting sequences naturally arise if you have -outliers. - - Example +outliers. For example, check out the financial crisis +in [FMS Symphony](http://fms.csvsoundsystem.com/#777). This is actually the same for data visuals; people often focus quite strongly on outliers -in graphs. - - Equivalent graph example +in graphs. 
Here's a graph of the FMS Symphony data. - ![Equivalent graph]() +![A frame from the FMS Symphony video](img/fms-symphony-frame.png) Data music, just like data visuals, can be set up to emphasize specific parts of a dataset. That is, @@ -345,8 +419,51 @@ completely ignores it. Anyway, keep in mind that outliers make your music interesting. -#### Exercise -No exercise for this, just something to think about +## Thinking about sound and multivariate analysis +In this tutorial, we only just scratched the surface of how we can represent +data as music. I leave you with some thoughts on how to go further. + +Given that you're reading this, I suspect that you already know something +about how to make meaningful plots. We've been studying data visualization +for quite a while, so we've come up with some pretty good theory about how +to make good graphics. Our ears work differently from our eyes, so much of +this theory won't apply very directly. You'll have to explore different ways +of creating sound such that our ears perceive the data properly. + +### Multivariate analysis +Here's a little tip to get you thinking. The world is multivariate, and we +should represent that in our visuals. (As Edward Tufte would say, escape +Flatland.) When we are representing dozens of variables at once, we can't +expect ourselves to be able to keep track of all of the individual variables; +once we get to more than a few variables, we tend to reduce the dimensionality +based on some sort of unsupervised learning, like clustering or principal +component analysis. We use these multivariate methods to get a bigger picture; +once we have the bigger picture, we can choose to delve deeper into specific +parts of the dataset and to look at the original variables. + +### Why vision might not be great for multivariate analysis +When you're producing music, food, or visuals from data, it's good to both +present the bigger picture and allow people to delve deeper into specifics. 
+I find that the sense of vision is particularly well suited for delving into +specifics. This is because visuals can be static and because we can easily +block out certain parts of visuals. + +When I say that visuals can be static, I mean that a person can decide with +her eyes how long to spend looking at them. Contrast this to sound, where a +person has to spend time listening in order to perceive a full song. With a +visual, you can easily slow down to focus on just one part. + +When I say that we can block out certain parts of visuals, I mean that we can +cover up parts of the visuals and just focus on the interesting part. +For example, we could have a huge scatterplot matrix but choose to focus on +only one of the scatterplots. Contrast this to sound and smell; with those +two senses, we can focus our perception by walking around or by pointing our +heads in different directions, but it's harder for us to focus on a particular +range of receptors (a band of frequencies or a set of smells). We can focus +our taste by choosing what we eat and to some degree by choosing which part +of our tongue we put our food on, but it's still not as much focus as we get +with vision. Touch is, perhaps, the closest sense to vision in the ability +to focus on particular stimuli. ## Review @@ -365,5 +482,23 @@ No exercise for this, just something to think about * You're still making music, so music theory applies. * Gaps along your musical time variable can be annoying. * Outliers are your solos. 
+* How to think about sound and multivariate analysis ## Other resources +* A talk about [Music videos in R](http://livestre.am/4pN67) +* [A blog post](http://thomaslevine.com/!/sensory-data-experiences/) about this stuff +* Tools + * [Data-driven rhythms](https://github.com/csv/ddr) + * [ddpy](https://github.com/csv/ddpy) + * [sheet music](http://csv.github.io/sheetmusic/) +* The Grammar of Graphics +* Any book by Edward Tufte + +Slides +----- +To see just the figures in the present document, run this. + +```sh +npm install -g reveal-md +make slides +``` diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..8712b99 --- /dev/null +++ b/setup.py @@ -0,0 +1,46 @@ +#!/usr/bin/env python + +# This file is part of ddpy. + +# Copyright (C) 2013 Brian Abelson, Thomas Levine and other contributors +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this +# software and associated documentation files (the "Software"), to deal in the Software +# without restriction, including without limitation the rights to use, copy, modify, +# merge, publish, distribute, sublicense, and/or sell copies of the Software, and to +# permit persons to whom the Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be included in all copies +# or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR +# PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE +# OR OTHER DEALINGS IN THE SOFTWARE. 
+ +from distutils.core import setup +import ddpy + +setup(name='ddpy', + author=ddpy.__author__, + author_email='pypi@scraperwiki.com', + description='Compose music with pandas', + url='https://github.com/csv/ddpy.git', + classifiers=[ + 'Intended Audience :: Developers', + 'Intended Audience :: Science/Research', + 'License :: OSI Approved :: MIT License', + 'Programming Language :: Python :: 2.7', + ], + packages=['ddpy'], + + # From requests + version=ddpy.__version__, + license='MIT', + install_requires = [ + 'midiutil', + ], +)