Friday, July 22, 2011

Parallel re-fetcher (using wget).

This script is a parallel re-fetcher, i.e. it can restart downloads, removing the old files and replacing them with new ones. The comments say it all.

#! /bin/bash
rm -Rf /tmp/fetching_links
# Takes in a file, and a number which defines how many links are to be fetched in parallel. This also utilizes wget's ability to reuse an existing TCP connection.
# e.g. ./fetch_parallel <path to list> <no. of threads (n)>
# The file taken in will be split into n parts, each stored in /tmp/fetching_links (as text files named 0, 1, 2, 3 etc... depending on the value of n).
# Each file will be fed to wget for fetching.
# The third argument is optional. If present, the script will re-fetch the downloads every n seconds (as specified by the third argument) and overwrite the existing files. If the argument is 'c', it'll re-fetch without waiting.
# Files will be stored in ./downloads
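# Example runs (file names here are just illustrative):
#   ./fetch_parallel links.txt 4        -> fetch once, 4 parallel wget instances
#   ./fetch_parallel links.txt 4 60     -> re-fetch every 60 seconds
#   ./fetch_parallel links.txt 4 c      -> re-fetch continuously, without waiting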
# variable to store all links
links="$(cat "$1")"
# Generate list of links (depending on no. of parallel fetches).
# variable each will contain the no. of lines to exist in each instance of wget.
declare -i each
each=$(($(echo "$links" | wc --lines)/$2))
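# e.g. a 10-line list with $2=3 gives each = 10/3 = 3 (integer division); the leftover line is handled below.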
# Helper index variable $num
declare -i num
num=1
#assign link list for each wget instance to process and split the file list in arrays of $links.
while [[ $(($num - 1)) != $2 ]]
do
    links[$num]=$(echo "$links" | head -n ${each})
    #trim list of files to allow second run.
    #remaining lines (not divisible with each) will be in links[0]
    links="$(echo "$links" | tail -n +$(($each+1)))"
    num=num+1
done
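# e.g. continuing the 10-link, 3-instance example: links[1], links[2] and links[3] hold 3 links each, and the 1 leftover link stays behind in links[0].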
#num will be used later as the no. of valid indexes in links. Thus reduce its value to the right one.
num=num-1
mkdir /tmp/fetching_links
# index variable i will be used to echo lines in array links to /tmp/fetching_links, each will be read separately by wget.
declare -i i
i=0
# If links[0] is "" or just a newline, there were no leftovers after the split of the link list. Make a mark of this.
if [[ -z "${links[0]}" || ${links[0]} == $'\n' ]]
then
# Set variable f to "empty" if links[0] was empty. For future use.
    f="empty"
fi
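# e.g. 8 links split across 4 wget instances leave no remainder, so links[0] is empty and f gets set.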
# num variable contains the no. of indexes available. For future use, it's copied over to another variable max.
declare -i max
max=$num
while [[ $num != -1 ]]
do
# File names will be same as the index variable in variable links
    echo "${links[$num]}" > /tmp/fetching_links/$num
    num=num-1
    if [[ $f == "empty" ]]
    then
        if [[ $num == 0 ]]
        then
            break
        fi
    fi
done
unset links
# Value of max will be needed again, so it's stored back in num.
declare -i num
num=$max
# Function which will be called again and again for repetitive fetching.
call_wget(){
# Resetting value of max in case of a recall.
    max=$num
# calling of wget starts here. Using previous variable max and 'f'
    while [[ $max != -1 ]]
    do
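        # -N re-fetches a file only when the remote copy is newer (this is what lets a re-run update the old files).
        # -P stores the downloads in ./downloads, -U sets the user agent string, and -i reads the list of URLs from the given file.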
        wget -N -P downloads --tries=3 --timeout=5 -U "Mozilla/5.0 (X11; U; Gentoo Linux x86_64; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/11.0.696.71 Safari/534.16" -i /tmp/fetching_links/$max &> /dev/null &
        max=max-1
# File 0 will be missing if f == "empty".
        if [[ $f == "empty" ]]
        then
            if [[ $max == 0 ]]
            then
                break
            fi
        fi
    done
}
t=$3
# If $3 is empty, wget will just run once.
if [[ $t == "c" ]] || [[ $t =~ ^[0-9]+$ && $t -gt 0 ]]
then
    while true # An infinite loop
    do
        call_wget
        echo "Fresh call"
    # See if any wget's running in bg using jobs.
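    # Polling 'jobs' lets the script print a status message while it waits; bash's 'wait' builtin would also block until the background wget jobs finish, but silently.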
        status="running"
        while [[ "$status" == "running" ]]
        do
            jobs | grep Running &> /dev/null
            if [[ $? == 0 ]]
            then
                status="running"
                sleep 2
                echo "running"
            else
                status=""
            fi
        done
        # The sleep timer, or no wait, depending on value of $3.
        if [[ $3 != "c" ]]
        then
            echo "waiting $3 seconds"
            sleep $t
        fi
    done
else
    call_wget
    status="running"
    while [[ "$status" == "running" ]]
    do
        jobs | grep Running &> /dev/null
        if [[ $? == 0 ]]
        then
            status="running"
            sleep 2
            echo "running in background"
        else
            status=""
            echo "All done."
        fi
    done
fi
rm -Rf /tmp/fetching_links
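
For example, with the script saved as fetch_parallel (the file names here are just illustrative), fetching a list of links with 4 parallel wget instances and re-fetching every 60 seconds would look like this:

chmod +x fetch_parallel
./fetch_parallel links.txt 4 60

The list in links.txt gets split into 4 files under /tmp/fetching_links, each file is handed to its own backgrounded wget, and once they all finish the whole cycle repeats after a 60 second wait.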
