#!/bin/bash
#
# parse-shrimp-data
#
# parses SHRIMP data from GSWA publications
# presents it in a format for easier transcribing into the GSWA database
#
# Developed and tested on Linux
# should work on FreeBSD / Mac OS X / Windows under cygwin
# (as long as you have current versions of grep, awk, cut and bash installed)
#
#
# RUNNING:
#----------
# 1) Take the GSWA publication in Word format, open it up in Word - save it as PLAIN TEXT
# 2) Make a directory including this script and the plain text file
# 3) Edit the plain text file taking out the contents sections, only include the data
#    You may need to fix up the text file and make sure it is in unix readable format (getting rid of ^M dos characters)
#    a quick way to do this is to run dos2unix on the text, for example:
#
#    bash2.05$ dos2unix shrimp-datafile.txt
#
#    this should clean up the file of any ^M control characters, making it less error prone during parsing
#
# 4) At this point you should open up a bash shell, go to the directory with this data in it and run it:
#
#    bash2.05$ cd /path/to/your/directory
#    bash2.05$ chmod +x parse-shrimp-data
#    bash2.05$ ./parse-shrimp-data shrimp-datafile.txt
#
#    (where "shrimp-datafile.txt" is the name of your file)
#
#    after running, (it may take some time, depending how much data you have) you will end up with a file called:
#    "shrimp-datafile.txt.parsed" (or whatever the name of your file was, .parsed)
#
#    at this point you can check for some errors (sort of) by running this:
#
#    bash2.05$ grep Other shrimp-datafile.txt.parsed
#
#    the numbers you get back you should check in your datafile, make sure the 'interpretation' area is contiguous, and
#    that there are no weird newlines breaking up a proper sentence that this script looks for. Other than that,
#    you will probably have to do a few by hand at the end.
#
# 5) now you can open this file in an Excel spreadsheet, use plain text, tab delimited
#    You may have to check the "treat consecutive delimiters as one" box so it doesn't add an extra blank row between each.
#
#    If you open up the current database, to match up this database with that one you will have to insert a row '1'
#    into this one (as the original has titles up there)
#    also when using all the GSWA publications, I had to do the following:
#    a) delete one of the 105007's, as GSWA doesn't have the monzonite sample present :(
#    b) add sample number 136819 in column 132 (insert new column) as this apparently isn't in any of the publications?
#       (either that or my script misses it, or I didn't do something right, oh well?)
#
# ASSUMPTIONS:
#   I assume that all GSWA numbers start with "1,2,8 or 9" and that some end in only A through C
#   I know this isn't the case forever, but still.
#---------
#
# Eric Thern ericNOSPAM(at) thern.org
# http://www.thern.org
#
# v0.1 July 10, 2003
# v0.2 July 12, 2003 - updated some of the initial egrep statements for better accuracy
#                      Also added some capitalization routines at the end.
#
#
# Script body.
#
# With no argument (or an empty one) the usage text is printed; otherwise the
# named plain-text data file is run through a grep/cut/awk/sort pipeline and
# the result is written next to it as "<datafile>.parsed".
#

# Print the usage text on stderr.
usage() {
  cat >&2 <<'EOF'

  USAGE: ./parse-shrimp-data <datafile>

  This will create a .parsed file for you to open in excel
  This file is tab delineated

EOF
}

#######################################
# Parse one plain-text data file into "<datafile>.parsed".
# Arguments: $1 - path to the plain-text data file
# Outputs:   writes the sorted result to "$1.parsed"; each output line is
#            sample number, "Original Comment: <text>" and the guessed
#            event type, separated by tabs
# Returns:   1 if the data file cannot be read, otherwise the exit status
#            of the final sort
#######################################
parse_shrimp_file() {
  local datafile=$1

  # Refuse to run on a missing/unreadable file instead of silently leaving
  # an empty .parsed file behind.
  if [[ ! -r "$datafile" ]]; then
    printf '%s: cannot read data file: %s\n' "${0##*/}" "$datafile" >&2
    return 1
  fi

  #
  # reverse grep for "Figure" and get rid of it (the lines that include
  # 'figure' also include the sample number, and mess this all up)
  #
  grep -E -v '(Figure)' -- "$datafile" |
  #
  # grep for certain key phrases, printing only the matched text (-o)
  # "[0-9]....(.[A-C]:|.:|:|[A-C]:)" matches the sample number
  # the rest of the alternatives match key phrase beginnings of the
  # interpretations of the samples
  #
  grep -E -o '([0-9]....(.[A-C]:|.:|:|[A-C]:)|[Tt]he analyses do not form........*|[Aa]ge of igneous........*|[Tt]ime of (igneous|crystallization|deposition|emplacement|granite).....*|[Cc]rystallization age of........*|[Bb]est estimate of........*|[Ii]nterpreted (as the age of|as the crystallization|as the igneous|as the time of igneous|time|as provid|as a provid|as a max|as a min|as an estimate|as indicating|as corresponding|as m).....*|[Pp]rovide(s a| a|s the)......*|[Mm]inimum age of.....*|[Ii]mplaced at.....*|[Ii]ndicating (the|an interpreted).......*)' |
  #
  # cut out only the first sentence (anything after this is not useful)
  #
  cut -d"." -f1 |
  #
  # awk code to cut out commas, but that also includes important commas
  # commonly used in conjunction with key words, such as 'of,' and 'commonly,'.
  # this is fairly error prone, but it enables the gathering of good data
  # while getting rid of the junk.
  #
  # NOTE(review): awk applies an assignment to FS starting with the NEXT
  # record, so each record here is split with the separator chosen while the
  # previous record was processed - confirm that this is what was intended.
  #
  awk -F"." '{if ($1 ~ /(^| )of,([^a-z]|$)/ ) {FS="of,"; print $1 "of," $2;} \
    else if ($1 ~ /(^| )commonly,([^a-z]|$)/ ) {FS="commonly,"; print $1 "commonly," $2;} \
    else if ($1 ~ /(^| )at,([^a-z]|$)/ ) {FS="at,"; print $1 "at," $2;} \
    else if ($1 ~ /(^| )to,([^a-z]|$)/ ) {FS="to,"; print $1 "to," $2;} \
    else {FS=","; print $1;}}' |
  #
  # everything from "at [number]" onwards is cut off the end
  # (could use cut, but I like awk); it is useless to us
  #
  awk -F"at [0-9]" '{print $1}' |
  #
  # Join everything into one comma separated stream: each record becomes
  # "Original Comment: <text before the first ':'>" followed by a tab.
  #
  # The BEGIN rule writes one leading "," so that the very first record is
  # also preceded by the ",Original Comment: " text the splitting stages
  # below use as their separator.  Without it the first sample in the file
  # is never split off, its number ends up in the wrong field and the
  # GSWA-number filter further down drops it.
  #
  # IGNORECASE is only honoured by GNU awk; other awks treat it as an
  # ordinary variable.
  #
  awk -F":" 'BEGIN {printf ","} {IGNORECASE=1; ORS=","; print "Original Comment: " $1 "\t"}' |
  #
  # One stage per leading digit of a GSWA number (8, 9, 1 and 2), all using
  # the same logic:
  #
  # split on ",Original Comment: <digit>" so every sample number starts a
  # new line; the first piece is printed as it is, every later piece gets
  # the digit that was consumed by the separator put back in front of it
  #
  awk -F",Original Comment: 8" '{print $1; for (n=2; n <= NF; n++) print "8"$n}' |
  awk -F",Original Comment: 9" '{print $1; for (n=2; n <= NF; n++) print "9"$n}' |
  awk -F",Original Comment: 1" '{print $1; for (n=2; n <= NF; n++) print "1"$n}' |
  awk -F",Original Comment: 2" '{print $1; for (n=2; n <= NF; n++) print "2"$n}' |
  #
  # look through all the ANALYSIS_COMMENTS fields and figure out what sort of
  # ANALYSIS_EVENTDATED entry we need
  # this is seriously rough, but a best guess is better than nothing -
  # just make sure to look through the end data and make sure it is correct.
  #
  # Because ORS is "," every output line after the first starts with a comma,
  # which is why the next stage finds the sample number in $2.
  #
  awk -F"," '{IGNORECASE=1; ORS=","; \
    if ($2 ~ /(^| )minimum([^a-z]|$)/ ) {print $1 "," $2 ",Minimum age of igneous crystallization" "\n";} \
    else if ($2 ~ /(^| )deposition([^a-z]|$)/ ) {print $1 "," $2 ",Maximum age for deposition" "\n";} \
    else if ($2 ~ /(^| )estimate([^a-z]|$)/ ) {print $1 "," $2 ",Best estimate for igneous crystallization" "\n";} \
    else if ($2 ~ /(^| )(precursor|protolith to the)([^a-z]|$)/ ) {print $1 "," $2 ",Igneous crystallization of precursor to gniess" "\n";} \
    else if ($2 ~ /(^| )xenocryst([^a-z]|$)/ ) {print $1 "," $2 ",Xenocryst age = maximum age of igneous crystallization" "\n";} \
    else if ($2 ~ /(^| )maximum([^a-z]|$)/ ) {print $1 "," $2 ",Maximum age for igneous crystallization" "\n";} \
    else if ($2 ~ /(^| )metamorphism([^a-z]|$)/ ) {print $1 "," $2 ",Metamorphism" "\n";} \
    else if ($2 ~ /(^| )(emplacement|emplaced|crystallization|recrystallization|extrusion age)([^a-z]|$)/ ) {print $1 "," $2 ",Igneous crystallization" "\n";} \
    else {print $1 "," $2 ",Other" "\n";} \
    }' |
  #
  # turn this all into tab delineated output and also clean up anything NOT
  # starting with a GSWA number (we don't want those!)
  #
  awk -F"," '{if ($2 ~ /(^| )(1|2|8|9)[0-9]([^a-z]|$)/ ) {print $2 "\t" $3 "\t" $4;}}' |
  #
  # take the output from above and capitalize the first letter of each
  # ANALYSIS_COMMENT, but after the "Original Comment:" field, as that is
  # already capitalized.
  #
  awk -F":" '{print $1 ": " toupper(substr($2,2,1)) substr($2,3)}' |
  #
  # sort the output and redirect it to this file
  #
  sort > "${datafile}.parsed"
  #
  # ending up with a file that is Tab delimited
  # can open this up in excel now
  #
}

if [[ -z "$1" ]]; then
  usage
else
  parse_shrimp_file "$1"
fi