Search This Blog

Tuesday, December 6, 2011

ARGO data

I've been playing with ARGO data to get a sense of how well the RTOFS Global model is doing in predicting vertical temperature and salinity structure.  I eventually got a script running that would do a daily download of the current day's real-time data for use in my comparison analysis with the RTOFS Global model. Keep in mind that I plan on using this script while at sea over a limited bandwidth connection to the internet so I chose to run wget for each file instead of passing it a list of files with the -i and -base options.

The script starts by downloading the "directory file" listing all available files, it looks like this:

# Title : Profile directory file of the Argo Global Data Assembly Center
# Description : The directory file describes all individual profile files of the argo GDAC ftp site.
# Project : ARGO
# Format version : 2.0
# Date of update : 20111206174544
# FTP root number 1 : ftp://ftp.ifremer.fr/ifremer/argo/dac
# FTP root number 2 : ftp://usgodae.usgodae.org/pub/outgoing/argo/dac
# GDAC node : FNMOC
file,date,latitude,longitude,ocean,profiler_type,institution,date_update
aoml/13857/profiles/R13857_001.nc,19970729200300,0.267,-16.032,A,845,AO,20080918131927
aoml/13857/profiles/R13857_002.nc,19970809192112,0.072,-17.659,A,845,AO,20080918131929
aoml/13857/profiles/R13857_003.nc,19970820184544,0.543,-19.622,A,845,AO,20080918131931
aoml/13857/profiles/R13857_004.nc,19970831193905,1.256,-20.521,A,845,AO,20080918131933
aoml/13857/profiles/R13857_005.nc,19970911185807,0.720,-20.768,A,845,AO,20080918131934
aoml/13857/profiles/R13857_006.nc,19970922195701,1.756,-21.566,A,845,AO,20080918131936
aoml/13857/profiles/R13857_007.nc,19971003191549,2.595,-21.564,A,845,AO,20080918131938
aoml/13857/profiles/R13857_008.nc,19971014183934,1.761,-21.587,A,845,AO,20080918131940
aoml/13857/profiles/R13857_009.nc,19971025193234,1.804,-21.774,A,845,AO,20080918131941
aoml/13857/profiles/R13857_010.nc,19971105185142,1.642,-21.362,A,845,AO,20080918131943
aoml/13857/profiles/R13857_011.nc,19971116194909,1.708,-20.758,A,845,AO,20080918131945
aoml/13857/profiles/R13857_012.nc,19971127190705,2.048,-20.224,A,845,AO,20080918131947
aoml/13857/profiles/R13857_013.nc,19971208183912,2.087,-19.769,A,845,AO,20080918131948
aoml/13857/profiles/R13857_014.nc,19971219192355,2.674,-20.144,A,845,AO,20080918131950
aoml/13857/profiles/R13857_015.nc,19971230184421,2.890,-20.433,A,845,AO,20080918131952
aoml/13857/profiles/R13857_016.nc,19980110194140,2.818,-20.699,A,845,AO,20080918131954
aoml/13857/profiles/R13857_017.nc,19980121190033,2.940,-20.789,A,845,AO,20080918131956
aoml/13857/profiles/R13857_018.nc,19980201195831,3.224,-20.757,A,845,AO,20080918131957

I parse the directory file looking for a date match in the date/time field (2nd field). You could easily modify this to limit it to a specific lat/lon bounding box or any other criteria.

Here's the script:

#!/bin/bash

base_argo_url=ftp://usgodae.org/pub/outgoing/argo

# Download the profile index
time1=`stat -f "%m" ar_index_global_prof.txt.gz`
wget --timestamping $base_argo_url/ar_index_global_prof.txt.gz
time2=`stat -f "%m" ar_index_global_prof.txt.gz`

if [ $time1 -eq $time2 ]
then
        echo "Nothing to do...no changes since last run"
        exit
fi

# Get today's date
today=`date -u '+%Y%m%d'`
echo "today is" $today

mkdir $today

zcat ar_index_global_prof.txt.gz | awk -F, '{if (NR > 9 && substr($2,1,8) == '$today') print $1 }' > $today/todays_casts.txt

cd $today

num_files=`cat todays_casts.txt | wc -l`

if [ $num_files -eq 0 ]
then
        echo "Nothing to do...no files to download yet for" $today
    exit
fi

echo "Going to check" $num_files "files"

for f in `cat todays_casts.txt`; do
        echo "Doing file" $f

        if [ -e `basename $f` ]
        then
                # Skip files that have already been downloaded
                continue
        fi

        # Don't need time stamping here, we check locally for existence of the
        # .nc file so don't need to waste time requesting a listing from the FTP server
        wget $base_argo_url/dac/$f
done



What comes out of this is a directory for the current day (named yyyymmdd) with a set of netCDF files in it (.nc file extension). Each file represents a cast from a given instrument, for example 20111206/R1900847_089.nc.

I then use a python script to read the .nc files and turn them into OMG/UNB format so that I can run comparisons against casts from RTOFS Global.

#!/usr/bin/env python2.6

import glob
import netCDF4
import numpy as np
import math
import datetime as dt
import matplotlib.pyplot as plt
import os

do_plot = True

if do_plot:
    plt.figure()
    plt.subplot(1,2,1)
    plt.xlabel("Temperature, deg C")
    plt.ylabel("Pressure, dbar")
    plt.hold
    plt.subplot(1,2,2)
    plt.xlabel("Salinity, psu")
    plt.ylabel("Pressure, dbar")
    plt.hold

for name in glob.glob('*.nc'):
    file = netCDF4.Dataset(name)

    latitude = file.variables['LATITUDE'][0]
    longitude = file.variables['LONGITUDE'][0]

    if math.isnan(latitude) or math.isnan(longitude):
        print "    skipping NAN lat/lon"
        continue

    juld = file.variables['JULD'][0]

    # TODO: the reference date is stored in 'REFERENCE_DATE_TIME'
    refdate = dt.datetime(1950,1,1,0,0,0,0,tzinfo=None)
    castdate = refdate + dt.timedelta(days=juld)

    print name + " " + str(latitude) + " " + str(longitude) + " " + str(castdate)

    try:
        # Only deal with casts that have ALL the data we need
        p = file.variables['PRES'][0][:]
        t = file.variables['TEMP'][0][:]
        t_fill_value = file.variables['TEMP']._FillValue
        t_qc = file.variables['TEMP_QC'][0][:]
        s = file.variables['PSAL'][0][:]
        s_fill_value = file.variables['PSAL']._FillValue
        s_qc = file.variables['PSAL_QC'][0][:]
    except:
        continue


    # Replace masked data with NAN
    # This will fail if there is no masked data since netCDF4 returns
    # a regular numpy array if no masked data but returns a masked numpy array
    # if there is.
    try:
        t_mask = t.mask
        t[t_mask] = np.NAN
    except:
        pass

    try:
        s_mask = s.mask
        s[s_mask] = np.NAN
    except:
        pass

    try:
        p_mask = p.mask
        p[p_mask] = np.NAN
    except:
        pass

    # Now filter based on quality control flags (we want 1, 2 or 5)
    t_ind = (t_qc == '1') | (t_qc == '2') | (t_qc == '5')
    s_ind = (s_qc == '1') | (s_qc == '2') | (s_qc == '5')

    # We only want to consider valid concurrent observations of T and S
    pair_ind = t_ind & s_ind

    if do_plot:
        plt.subplot(1,2,1)
        plt.plot(t[pair_ind],-p[pair_ind]);
        plt.subplot(1,2,2)
        plt.plot(s[pair_ind],-p[pair_ind]);

    t_filt = t[pair_ind]
    s_filt = s[pair_ind]
    p_filt = p[pair_ind]
    num_samples = t_filt.size

    file.close

    if num_samples == 0:
        print "    Skipping " + name + " due to lack of data!"
        continue

if do_plot:
    plt.show()


Here's a plot of data from 2011-12-06 at 1:30PM, EST.





Here's a map showing the geographic distribution of the casts for this particular run (2011-12-06).


Still to do?  Read up more about the various QC procedures applied to ARGO data and try to automate detection of casts that will mess up my comparison analysis (large chunks of missing data, etc).  Here's a bit of light reading to get me started. 

A la prochaine...

Wednesday, November 2, 2011

Global RTOFS continued: vertical T/S profiles

I'm primarily interested in extracing depth profiles of temperature/salinity from the model, here's an example of how to do it.

import matplotlib.pyplot as plt
import netCDF4

mydate='20111101'
url_temp='http://nomads.ncep.noaa.gov:9090/dods/rtofs/rtofs_global'+mydate+'/rtofs_glo_3dz_forecast_daily_temp'
file = netCDF4.Dataset(url_temp)
lat  = file.variables['lat'][:]
lon  = file.variables['lon'][:]
data_temp = file.variables['temperature'][1,:,1000,2000]
depths=file.variables['lev'][:]
file.close
url_salt='http://nomads.ncep.noaa.gov:9090/dods/rtofs/rtofs_global'+mydate+'/rtofs_glo_3dz_forecast_daily_salt'
file = netCDF4.Dataset(url_salt)
data_salt = file.variables['salinity'][1,:,1000,2000]
plt.figure()
plt.plot(data_temp,-depths)
plt.xlabel("Potential Temperature (re 2000m)")plt.ylabel("Depth (m)")
plt.figure()
plt.plot(data_salt,-depths)
plt.xlabel("Salinity (psu)")
plt.ylabel("Depth (m)")

And that's it!  A few figures below to show the plots.


Friday, October 28, 2011

Global RTOFS

Sea surface temperature, 2011-10-27
Thanks to John Kelley for pointing this out to me.  There's a Global RTOFS ocean model running now with pretty easy access via python.  I spent last night trying to get it up and running on my MacBook Pro, here are a few notes.  Before diving into it, check out the website for Global RTOFS here.

Step 1: get some python modules

 # unpack the tarball
cd ~/installs_Mac/python_modules/
tar -zxvf netCDF4-0.9.7.tar.gz
cd netCDF4-0.9.7


# install some hdf5 libs
fink install hdf5-18

# install some netcdf4 libs, this installs hdf5 stuff also so I'm not sure the above step was needed?
fink install netcdf7

# I had to figure out where fink installed the shared libraries for netcdf4 (they're in /sw/opt/netcdf7/)

otool -L /sw/bin/ncdump
/sw/bin/ncdump:
/sw/opt/netcdf7/lib/libnetcdf.7.dylib (compatibility version 9.0.0, current version 9.1.0)
/sw/lib/libsz.2.dylib (compatibility version 3.0.0, current version 3.0.0)
/sw/lib/libhdf5_hl.7.dylib (compatibility version 8.0.0, current version 8.1.0)
/sw/lib/libhdf5.7.dylib (compatibility version 8.0.0, current version 8.1.0)
/usr/lib/libSystem.B.dylib (compatibility version 1.0.0, current version 125.2.11)
/usr/lib/libz.1.dylib (compatibility version 1.0.0, current version 1.2.3)
/sw/lib/libcurl.4.dylib (compatibility version 7.0.0, current version 7.0.0)

# set the NETCDF4_DIR based on output from otool above
export NETCDF4_DIR=/sw/opt/netcdf7/

# okay, now ready tot build the netCDF4 for python bits
# Set some enviroenment variablesexport CFLAGS=-m32
python2.6 setup.py install --home=~

# Now do the same for the basemap module
cd ~/installs_Mac/python_modules/
tar -zxvf basemap-1.0.1.tar.gz
cd basemap-1.0.1

# Need to use the default 'python' instead of the version installed through fink...wish I knew why
PATH="/Library/Frameworks/Python.framework/Versions/2.6/bin:${PATH}"
export PATH

# can't do the usual CFLAGS=-m32 (which is usually required for building against python as installed
# by fink?
unset CFLAGS
export GEOS_DIR=/sw/opt/libgeos3.2.2/
python2.6 setup.py install --home=~



Step 2.  Plot some data

Now that all goodies are installed, I tried out the script from here (note that I had to change the date to 20111027, the date provided in the example didn't work for me).

Here's my version of the script, mine does a salinity plot as well.


#!/usr/bin/env python2.6

from mpl_toolkits.basemap import Basemap
import numpy as np
import matplotlib.pyplot as plt
from pylab import *
import netCDF4


mydate='20111027'

url_temp='http://nomads.ncep.noaa.gov:9090/dods/rtofs/rtofs_global'+mydate+'/rtofs_glo_3dz_forecast_daily_temp'
file = netCDF4.Dataset(url_temp)
lat  = file.variables['lat'][:]
lon  = file.variables['lon'][:]
data_temp      = file.variables['temperature'][1,1,:,:]
file.close()


m=Basemap(projection='mill',lat_ts=10, llcrnrlon=lon.min(),urcrnrlon=lon.max(), llcrnrlat=lat.min(),urcrnrlat=lat.max(), resolution='c')
Lon, Lat = meshgrid(lon,lat)
x, y = m(Lon,Lat)

plt.figure()
cs = m.pcolormesh(x,y,data_temp,shading='flat', cmap=plt.cm.jet)

m.drawcoastlines()
m.fillcontinents()
m.drawmapboundary()
m.drawparallels(np.arange(-90.,120.,30.), labels=[1,0,0,0])
m.drawmeridians(np.arange(-180.,180.,60.), labels=[0,0,0,1])

colorbar(cs)
plt.title('Example 1: RTOFS Global Temperature')
plt.show()

# Now do the salinity
url_salt='http://nomads.ncep.noaa.gov:9090/dods/rtofs/rtofs_global'+mydate+'/rtofs_glo_3dz_forecast_daily_salt'
file = netCDF4.Dataset(url_salt)
lat  = file.variables['lat'][:]
lon  = file.variables['lon'][:]
data_salt      = file.variables['salinity'][1,1,:,:]
file.close()

m=Basemap(projection='mill',lat_ts=10, llcrnrlon=lon.min(),urcrnrlon=lon.max(), llcrnrlat=lat.min(),urcrnrlat=lat.max(), resolution='c')
Lon, Lat = meshgrid(lon,lat)
x, y = m(Lon,Lat)

plt.figure()
cs = m.pcolormesh(x,y,data_salt,shading='flat', cmap=plt.cm.jet)

m.drawcoastlines()
m.fillcontinents()
m.drawmapboundary()
m.drawparallels(np.arange(-90.,120.,30.), labels=[1,0,0,0])
m.drawmeridians(np.arange(-180.,180.,60.), labels=[0,0,0,1])

colorbar(cs)
plt.title('Example 1: RTOFS Global Salinity')
plt.show()



Results?

Here's a few screen shots of the plots I managed to generate using my script.  Next up?  Extracting vertical profiles of temperature and salinity.


Tuesday, August 10, 2010

svp_tool WOD query tool

I spent some time coding on the weekend (which would be considered sad by many, but not by me) and built a simple push button GUI interface that allows the user to refine WOD queries by instrument and month.  If I wasn't limited by the X interface, I'd add more SQL type functionality and allow for queries on any field (year, cruise ID, etc).  More work in the future I guess.  Here's a screen shot of the button bar that allows the user to choose instrument type and month of observation, the top row is for the month, the bottom row is for instrument type.  The labels on the buttons switch to UPPERCASE when selected and back to lowercase when deselected.  The query is only done after a geographic selection in the map window.  I've shown an example of several hundred CTD casts collected in the months of July, August and September in the Northwest Passage in northern Canada, one of my favourite areas that I've ever worked in.


The CTD data is plotted in the main SVP Tool window above, with sound speed, temperature and salinity plotted versus depth (left to right, respectively).

More work?  Of course.  If I turn on the debug mode of the WOD query engine, I can see that it spends an awful lot of its time unzipping the raw source data for each WMO square that it visits during the query extraction.  I'm still looking to speed this up if i can, right now the code fires off a system('gunzip -c file > tempfile'); whenever it hits a zipped WOD file.  I'm curious if it would be faster to run through zlib instead?  Kurt had also suggested converting the WOD files to a binary format so that's another option that would guarantee to bear some fruit but I really do like the elegance of sticking with the raw WOD files (easy to update, no need to reconvert with each update).  More thinking needs to be done, obviously...  Suggestions are welcome.

Wednesday, August 4, 2010

Downloading WOD

You can download the WOD files from the NODC website through an online query (http://www.nodc.noaa.gov/OC5/SELECT/dbsearch/dbsearch.html), you can also download the entire database if you look hard enough for the links (http://www.nodc.noaa.gov/OC5/WOD09/data09geo.html).  I didn't feel like pointing and clicking myself to death so I wrote this script to download the entire thing.  I also added a line in the script to compile a summary database of the CTD and XBT data for use in svp_tool.

#!/bin/bash

count=0

mkdir -p {OSD,MBT,CTD,XBT,PFL,MRB,DRB,APB,UOR,GLD}/OBS
mkdir -p {OSD,MBT,CTD,XBT,PFL,MRB,DRB,APB,UOR,GLD}/STD

echo "Downloading the WOD database"

for quadrant in 1 3 5 7
do
    for latitude in 0 1 2 3 4 5 6 7 8
    do 
        for longitude in 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17
        do 
            let count++
            for type in OSD MBT CTD XBT PFL MRB DRB APB UOR GLD
            do 
                # Get the "observed" data
                file=$type\O$quadrant$latitude$longitude.gz
                wget http://data.nodc.noaa.gov/woa/WOD09/GEOGRAPHIC/$type/OBS/$file
                mv $file $type/OBS

                # Get the "standard level" data (observed is filtered and then
                # interpolated to standard depth levels)
                file=$type\S$quadrant$latitude$longitude.gz
                wget http://data.nodc.noaa.gov/woa/WOD09/GEOGRAPHIC/$type/STD/$file
                mv $file $type/STD
            done
        done
    done
done

echo got total $count

# Now add a tmp directory for use by the WOD library for unzipping database files
# during extraction
mkdir tmp

# Now build database of CTD and XBT data
echo "Building a CTD/XBT summary database"
wod2db3 -out wod.db3 CTD/OBS/CTDO*gz XBT/OBS/XBTO*gz
echo
echo
echo "Add WOD_PATH="`pwd` "as an environment variable"
echo "Add WOD_DATABASE_NAME="`pwd`"/"wod.db3 "as an environment variable"

exit

Tuesday, August 3, 2010

sqlite3 and the World Ocean Database, continued

I got things sped up quite a bit by directly querying the database file I had created (see my last post) and by returning the query results as an array of watercolumn structures instead of writing them to disk and then reading them in again in my svp_tool.

Here's the bit of code that deals with building the query:

int wod_extract_profiles (WOD_Reader * wod_reader)
{
    int rc;
    char *zErrMsg = 0;

    int i;
    int offset;

    char month_list[50];
    char instrument_list[50];
    char sql_string[500];

    memset(month_list,0,50);
    memset(instrument_list,0,50);
    memset(sql_string,0,500);

    // There must be easier ways to do this than the way I'm doing it
    month_list[0]='(';
    offset = 1;
    for (i=1;i<=12;i++)
    {
        if (!wod_reader->query_bounds.month[i]) continue;
        sprintf(&month_list[offset],"%d,",i);
        offset = strlen(month_list);
    }
    month_list[offset-1]=')';
    // There must be easier ways to do this than the way I'm doing it
    instrument_list[0]='(';
    offset = 1;
    for (i=1;i<=WOD_NUM_TYPES;i++)
    {
        if (!wod_reader->query_bounds.instrument[i]) continue;
        sprintf(&instrument_list[offset],"%d,",i);
        offset = strlen(instrument_list);
    }
    instrument_list[offset-1]=')';

    // Check some of the query bounds before we start
    if (wod_reader->query_bounds.longitude_west > 180) wod_reader->query_bounds.longitude_west -= 360;
    if (wod_reader->query_bounds.longitude_east > 180) wod_reader->query_bounds.longitude_east -= 360;

    // Form the query: select records based on geo/time bounds and then sort in such a way that
    // the hits return in WOD filename order to minimize jumping around between files
    sprintf(sql_string,"SELECT * FROM tbl1 WHERE latitude > %.3f AND latitude < %.3f AND longitude > %.3f AND longitude < %.3f AND month in %s AND instrument in %s ORDER BY instrument, observed, quadrant, wmo_lat, wmo_lon, year, month, day;",
        wod_reader->query_bounds.latitude_south, wod_reader->query_bounds.latitude_north,
        wod_reader->query_bounds.longitude_west, wod_reader->query_bounds.longitude_east,month_list,instrument_list);

    if (wod_reader->debug) printf("Doing SQL string %s\n",sql_string);

    printf("Doing WOD query...patience\n");

    // Do the query and deal with the results
    rc = sqlite3_exec(wod_reader->db, sql_string, wod_query_callback, wod_reader, &zErrMsg);
    if (rc) error("SQL error: %s",zErrMsg);

    return(0);
}


The callback function (wod_query_callback) in the sqlite3_exec call deals with the query results one at a time.  Here'swhat's going on in there:

static int wod_query_callback(void * reader_in, int argc, char **argv, char **azColName)
{
    int i;
    int iend;

    int got_new_file;

    WOD_Reader *wod_reader;

    char * inname = NULL;

    int file_offset, num_bytes;

    int instrument, observed, quadrant, wmo_lat, wmo_lon;

    char type_lowercase[4];
    char type_uppercase[4];

    memset(type_uppercase,0,4);
    memset(type_lowercase,0,4);

    wod_reader = (WOD_Reader *) reader_in;

    got_new_file = 0;

    // Parse the output record from the query
    for(i=0; i
    {
        if (wod_reader->debug) printf("%s = %s\n", azColName[i], argv[i] ? argv[i] : "NULL");

        if (!strcmp(azColName[i],"instrument"))
        {
            instrument = atoi(argv[i]);
            if (instrument != wod_reader->instrument) got_new_file++;
        }
        else if (!strcmp(azColName[i],"observed"))
        {
            observed = atoi(argv[i]);
            if (observed != wod_reader->observed) got_new_file++;
        }
        else if (!strcmp(azColName[i],"quadrant"))
        {
            quadrant = atoi(argv[i]);
            if (quadrant != wod_reader->quadrant) got_new_file++;
        }
        else if (!strcmp(azColName[i],"wmo_lat"))
        {
            wmo_lat = atoi(argv[i]);
            if (wmo_lat != wod_reader->wmo_lat) got_new_file++;
        }
        else if (!strcmp(azColName[i],"wmo_lon"))
        {
            wmo_lon = atoi(argv[i]);
            if (wmo_lon != wod_reader->wmo_lon) got_new_file++;
        }
        else if (!strcmp(azColName[i],"file_offset"))
        {
            file_offset = atoi(argv[i]);
        }
        else if (!strcmp(azColName[i],"num_bytes"))
        {
            num_bytes = atoi(argv[i]);
        }
    }

    if (got_new_file)
    {
        if (wod_reader->debug) printf("Need to load a new WOD file!!\n");

        // Got a new file, so close the old one...
        wod_close_file(wod_reader);

        memcpy(type_uppercase,wod_data_types[instrument],3);
        type_lowercase[0] = tolower(type_uppercase[0]);
        type_lowercase[1] = tolower(type_uppercase[1]);
        type_lowercase[2] = tolower(type_uppercase[2]);

        // Build up the filename
        inname = calloc(strlen(wod_reader->wod_path)+50,1);
        memset(inname,0,strlen(wod_reader->wod_path)+50);

        // this the type of path that we're trying to construct: ctd/OBS/CTDO7807.gz
        sprintf(inname,"%s/%s/OBS/%sO%d%d%0.2d.gz",
        wod_reader->wod_path,type_lowercase,type_uppercase,quadrant,wmo_lat,wmo_lon);

        if (wod_open_file(wod_reader,inname))
        {
            sprintf(inname,"%s/%s/OBS/%sO%d%d%0.2d",wod_reader->wod_path,type_lowercase,type_uppercase,
                quadrant,wmo_lat,wmo_lon);
            if (wod_open_file(wod_reader,inname))
            {
                printf("Can't open file %s ... non-existant??\n",inname);
                free(inname);
                inname = NULL;
                return(1);
            }
        }

        if (inname) free(inname);
        inname = NULL;
    }

    if (wod_reader->debug) printf("%s: fseeking to %d for %d bytes\n",wod_reader->fname, file_offset,num_bytes);

    // Seek to the right spot in the file
    fseek(wod_reader->fp,file_offset,SEEK_SET);

    // Now read the profile at that location
    if ((iend = oclread (wod_reader)) == -1)
    {
        wod_close_file(wod_reader);
        return(1);
    }

    // Convert the cast to OMG SVP format, this updates the extracted_watercolumns array
    wod2svp(wod_reader);

    return 0;
}



Now I've just got to build a query building utility in the graphical end that lets the user select instrument, month and/or other fields.  Geographic bounds are currently selected by lasso in the map viewer and the month defaults to the current month.

I'm also planning on speeding up the oclread function (was delivered with the free NODC code that came with the WOD), right now it's reading things a single field at a time and I think it can be sped up a bit by reading the entire cast into memory and then parsing it from there.

jb

Friday, July 30, 2010

sqlite3 and the World Ocean Database

I'm working on getting the World Ocean Database (WOD) to be directly accessible by my svp_tool (sound velocity processing toolkit) and wanted to document some of the procedure I took.  Here's a map view of a query area near the Mariana Trench, the green squares show the locations of the 125 CTD casts in the WOD (for the month of August).


The resulting sound speed, temperature and salinity vertical profiles are shown below (left to right, all plotted versus depth on the vertical axis).


The WOD is a database of oceanographic data that records physical/biological/chemical oceanographic measurements, currently the 2009 database holds ~10 million sets of measurements. The data are organized spatially by WMO squares and by instrument type resulting in 12,960 files. I'm mainly interested in XBT and CTD measurements; the files that I need to tuck into and extract temperature and/or salinity profiles based on a user selected geographic region and range of months. The unzipped files are in an awkward ASCII format that take up about 6GB on the disk in gzip format. I'd like to make the lookup options as quick as possible so:

0) Wrap up the sample I/O C program as a C library (wod.{c,h}). The sample program that comes with the database has lots of global variables so I just wrapped it all up in a structure and then wrote some functions to initialize, open/close files, read data that neatly hide away the details and keep the programmer (me) from stepping on my own toes with all those globals and obscure function calls that need to be done just to read a single cast.

1) Build a database. The raw files contain header information (position, date, cruise number, etc) and then the measurements themselves. Having to unzip files just to search through to find casts that intersect the area of interest is slow and painful. I took the sample C program to read the WOD from their website and my new C library (wod.{c,h}) and morphed it into a program that writes out a sqlite3 database that contains only a small subset of the info in the header so that I can quickly query and find casts. The database also contains enough info to determine which of the 12,960 files an individual cast falls within and its file offset. Based on some advice from Kurt, I went with sqlite3 and googled my way through the problem. Here are a few juicy bits from the code, it was pretty simple in the end.

#include
int main()
{
....
// declaration
sqlite3 *db;
....
// open a database with (char * outname)
sqlite3_open(outname, &db);
....
// Create a table, will puke if the table has been added already
sqlite3_exec(db, "create table tbl1 (instrument integer, observed integer, quadrant integer, wmo_lat integer, wmo_lon integer, latitude real, longitude real, year integer, month integer, day integer, file_offset integer)", callback, 0, &zErrMsg);
.....
// Now insert some records
sprintf(sql_string,"INSERT INTO tbl1 (instrument, observed, quadrant, wmo_lat, wmo_lon, \
latitude, longitude, year, month, day, file_offset) VALUES (%d,%d,%d,%d,%d,%.3f,%.3f,%4d,%d,%d,%d)",
wod_reader.instrument,wod_reader.observed,wod_reader.quadrant,
wod_reader.wod_lat,wod_reader.wod_lon,
wod_reader.latitude,wod_reader.longitude,
wod_reader.year,wod_reader.month,wod_reader.day,file_offset);
rc = sqlite3_exec(db, sql_string, callback, 0, &zErrMsg);

// Build indices based on latitude and longitude as I expect to be querying against
// these most often
sqlite3_exec(db, "CREATE INDEX tbl1_idx_longitude ON tbl1(longitude)", callback, 0, &zErrMsg);
sqlite3_exec(db, "CREATE INDEX tbl1_idx_latitude ON tbl1(latitude)", callback, 0, &zErrMsg);

// Now close the database.
sqlite3_close(db);

...
}


Kurt had suggested using the SQLite Manager extension in Firefox to have a peek at what's in the database. I did so, and everything seemed okay.



Next steps?

- add sqlite3 querying ability into svp_tool
- add functionality to wod.{c,h} to read casts based on query results, right now it searches the entire WOD namespace and quickly rejects files that can't possibly intersect the spatial query bounds.  Those that do intersect are then unzipped and search sequentially to find the observations that fall within a specified area (and month).  ALL records are read through in their entirety even though MOST aren't desired.
- modify wod.{c,h} to deal with reading the .gz files directly through zlib instead of gunzipping and then gzipping the files for each extraction.