/ Published in: Bash
Not sure why I'm posting this madness. Bash can be good for some things, though. I think part of the reason I used bash was that I was calling a lot of external commands.
Expand |
Embed | Plain Text
Copy this code and paste it in your HTML
#!/bin/sh
#
# scrape_yahoo.sh
#
# This script pulls data from yahoo using a NYSE index list generated
# from scrape_nyse.sh. It iterates through the list, saving data from each
# security in one big log and separate per-security log files. This stuff
# should go into a database, sooner or later.
#
# Files read:    $SYMBOLS_FILE, ${BASEDIR}../data/yahoo_arg_string_custom.txt
# Files written: $DATA_FILE, ${SECS_DIR}<symbol>.csv, $LOGFILE, $TMP_FILE,
#                $PIDFILE (removed again on exit)
# Exit status:   0 on success; 1 if another run holds the PID file, the
#                symbol list is unreadable, the run is interrupted, or the
#                PID file cannot be removed.

# Set to anything other than 0 to log every URL and symbol batch.
DEBUG=0
PATH=$PATH:/usr/local/bin
BASEDIR=/home/stockh/stockharmony.com/api/scripts/
SYMBOLS_FILE=${BASEDIR}../data/nyse_index_symbols.txt
# the SYMBOL string used in a request to yahoo
SYMBOLS=''
# took out "6t" (the URL)
ARG_STRING=$(tr -d '\n' < "${BASEDIR}../data/yahoo_arg_string_custom.txt")
YAHOO_URL='http://finance.yahoo.com/d/quotes.csv?s='
COUNT=0
# how many symbols should we query in one request?
GET_SYMBOLS=15
SLEEP_TIME=60
BEGIN_TIME=$(date +%Y%m%d%H%M%S)
DATA_FILE=${BASEDIR}../data/yahoo-finance-${BEGIN_TIME}.csv
# NOTE(review): fixed, predictable name in /tmp; the PID file below keeps
# two runs from clobbering it, but consider mktemp if nothing else reads it.
TMP_FILE=/tmp/yahoo-data
LOGFILE=${BASEDIR}../logs/scraping
SECS_DIR=${BASEDIR}../data/securities/
SELF=$(basename "$0")
PIDFILE=${BASEDIR}../logs/${SELF}.pid
SMS_ADDRESS='1234567789@cingularme.com'
# becomes 1 once an SMS went out, so a run sends at most one message
SMS_SENT=''
# last message written by log_error / log_normal
STRING=''

# define functions first, put in include file later

# Succeeds when debug logging is switched on (DEBUG set and not 0).
debug_enabled () {
  [ "${DEBUG:-0}" != 0 ]
}

# send sms msg, only once
# Arguments: $1 - message text (defaults to the last logged message)
send_sms_msg () {
  sms_text=${1:-$STRING}
  if [ -n "$SMS_SENT" ]; then
    return 0
  fi
  printf '%s\n' "$sms_text" | mailx -s 'stockh error' "$SMS_ADDRESS"
  SMS_SENT=1
}

# Append a timestamped error entry to $LOGFILE.
# Arguments: $1 - optional reason
#            $2 - optional return code to report; defaults to the status of
#                 the command that ran right before this function was called
log_error () {
  # Must be the first statement: any later command overwrites $?.
  log_rc=${2:-$?}
  if [ -n "$1" ]; then
    STRING="OOPS: return code $log_rc because $1"
  else
    STRING="OOPS: return code $log_rc"
  fi
  date >> "$LOGFILE"
  printf '%s\n' "$STRING" >> "$LOGFILE"
}

# Append a timestamped informational entry to $LOGFILE.
# Arguments: $1 - optional message
log_normal () {
  # Must be the first statement: any later command overwrites $?.
  log_rc=$?
  if [ -n "$1" ]; then
    STRING="OK: $1"
  else
    STRING="OK: seems ok $log_rc"
  fi
  date >> "$LOGFILE"
  printf '%s\n' "$STRING" >> "$LOGFILE"
}

# Append the fetched rows in $TMP_FILE to the run-wide $DATA_FILE and to one
# CSV per symbol under $SECS_DIR, each row prefixed with $GOT_WHEN.
# Globals: SYMBOLS, TMP_FILE, DATA_FILE, SECS_DIR, GOT_WHEN, LOGFILE (read)
save_symbol_data () {
  # save once in main file
  cat "$TMP_FILE" >> "$DATA_FILE" 2>> "$LOGFILE"

  # grep for each symbol and save in separate files
  # $SYMBOLS is deliberately unquoted: it is a space separated list.
  for j in $SYMBOLS; do
    # -F: symbols such as BRK.A must match literally, not as a regex.
    rows=$(grep -iF -- ",\"$j\"," "$TMP_FILE" 2>> "$LOGFILE")
    if [ -n "$rows" ]; then
      # Prefix every matching row; GOT_WHEN is digits only, safe inside sed.
      printf '%s\n' "$rows" | sed "s/^/${GOT_WHEN},/" >> "${SECS_DIR}$j.csv"
    elif debug_enabled; then
      # Nothing is written, so the CSV never gets a dangling timestamp.
      log_normal "no row found for $j"
    fi
  done
}

# Look if any symbols have changed in TMP_FILE, get that data too, append to
# TMP_FILE.
# Globals: TMP_FILE, YAHOO_URL, ARG_STRING, LOGFILE (read); SYMBOLS (appended)
fetch_changed_symbol_data () {
  # Extract the replacement symbols and join them with single spaces.
  NEW=$(grep '"Ticker symbol has changed to:' "$TMP_FILE" \
    | sed 's/.*changed to: <a href="\/q?s=\(.*\)">.*/\1/' \
    | awk '{ for (i = 1; i <= NF; i++) printf "%s%s", (n++ ? " " : ""), $i }')

  if [ -z "$NEW" ]; then
    return 0
  fi

  log_normal "got changed symbols: $NEW"
  # save new syms to global var $SYMBOLS
  SYMBOLS="${SYMBOLS} ${NEW}"
  changed_query=$(printf '%s' "$NEW" | tr ' ' '+')
  changed_url="${YAHOO_URL}${changed_query}&f=$ARG_STRING"

  # append data to tmp file (URL quoted: it contains '?' and '&')
  lynx -dump "$changed_url" >> "$TMP_FILE" 2>> "$LOGFILE"
  changed_rc=$?

  # report errors
  if [ "$changed_rc" -eq 0 ]; then
    if debug_enabled; then log_normal "$changed_url"; fi
  else
    log_error "lynx failed getting changed symbols $changed_url" "$changed_rc"
    send_sms_msg "lynx failed getting $NEW"
  fi
}

# take space separated list of symbols, query yahoo and save to TMP_FILE
# Globals: SYMBOLS, YAHOO_URL, ARG_STRING, TMP_FILE, LOGFILE (read);
#          GOT_WHEN (written)
fetch_and_save_symbol_data () {
  # replace space with + for URL
  batch_query=$(printf '%s' "$SYMBOLS" | tr ' ' '+')
  batch_url="${YAHOO_URL}${batch_query}&f=$ARG_STRING"

  # clobber TMP_FILE with new data
  lynx -dump "$batch_url" > "$TMP_FILE" 2>> "$LOGFILE"
  batch_rc=$?

  if [ "$batch_rc" -ne 0 ]; then
    log_error "lynx failed getting $batch_url" "$batch_rc"
    send_sms_msg "lynx failed on $batch_query"
    return
  fi

  if debug_enabled; then log_normal "$batch_url"; fi
  GOT_WHEN=$(date +%Y%m%d%H%M%S)
  fetch_changed_symbol_data
  # regardless of fetch_changed_symbol_data always save symbol data at
  # this point
  save_symbol_data
}

# Fetch and store the symbols collected in $SYMBOLS, pause to throttle
# requests, then reset the batch.
flush_batch () {
  if debug_enabled; then log_normal "SYMBOLS are $SYMBOLS"; fi
  fetch_and_save_symbol_data
  sleep "$SLEEP_TIME"
  SYMBOLS=''
  COUNT=0
}

# Refuse to run concurrently with another instance.
if [ -f "$PIDFILE" ]; then
  send_sms_msg "$SELF exiting, PID exists"
  log_error "$SELF exiting, PID exists" 1
  exit 1
fi

if [ ! -r "$SYMBOLS_FILE" ]; then
  log_error "cannot read $SYMBOLS_FILE" 1
  send_sms_msg "$SELF exiting, cannot read symbols file"
  exit 1
fi

echo $$ > "$PIDFILE" 2>> "$LOGFILE"
# Do not leave a stale PID file behind when interrupted; it would block
# every later run.
trap 'rm -f "$PIDFILE"; exit 1' HUP INT TERM

# save $GET_SYMBOLS amount of symbols in $SYMBOLS then call functions
# The list is read on fd 3 so commands in the loop body cannot eat it via
# stdin. The "|| [ -n ... ]" keeps a last line without a trailing newline.
while read -r i <&3 || [ -n "$i" ]; do
  # skip blank lines
  if [ -z "$i" ]; then
    continue
  fi
  COUNT=$((COUNT + 1))
  SYMBOLS="${SYMBOLS:+$SYMBOLS }$i"
  if [ "$COUNT" -ge "$GET_SYMBOLS" ]; then
    flush_batch
  fi
done 3< "$SYMBOLS_FILE"

# Whatever is left is the final, partial batch.
if [ -n "$SYMBOLS" ]; then
  flush_batch
fi

{
  echo "$SELF started at $BEGIN_TIME"
  echo "$SELF finished on $(date)"
  wc -l "$DATA_FILE"
  wc -l "$SYMBOLS_FILE"
} >> "$LOGFILE" 2>&1

rm -f "$PIDFILE" >> "$LOGFILE" 2>&1
rm_rc=$?
if [ "$rm_rc" -eq 0 ]; then
  exit 0
fi
send_sms_msg "could not remove PIDFILE. rm returned $rm_rc"
exit 1