##############################################################################
# Copyright (c) Members of the EGEE Collaboration. 2007.
# See http://www.eu-egee.org/partners/ for details on the copyright
# holders.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
##############################################################################
#
# NAME :        config_mpi_ce
#
# DESCRIPTION : This function configures the CE for MPI.
#               
#
# AUTHORS :     Stephen.Childs@cs.tcd.ie, John.Walsh@cs.tcd.ie,
#               enolfc@ifca.unican.es
#
# NOTES :       
#
# YAIM MODULE:  glite-yaim-mpi
#
##############################################################################

# YAIM pre-configuration check hook for this module.  No site-info.def
# variables are mandatory for MPI CE configuration at the moment, so this
# only records that fact in the debug log.
config_mpi_ce_check() {
    yaimlog DEBUG "This function doesn't currently require any variables."
}

# YAIM environment hook for this module.  Nothing needs to be exported for
# MPI CE configuration at the moment, so this only records that fact in the
# debug log.
config_mpi_ce_setenv() {
    yaimlog DEBUG "This function doesn't currently set any environment variables."
}

config_mpi_ce() {
    # Advertise the MPI capabilities of this CE through Glue
    # runtime-environment tags.  For every supported flavour the site has
    # enabled (MPI_<FLAVOUR>_ENABLE=yes) the flavour name, and optionally
    # "<FLAVOUR>-<VERSION>", is appended to CE_RUNTIMEENV.  When at least
    # one flavour is enabled, MPI-START (plus its version when it can be
    # determined) and the MPI_SHARED_HOME / MPI_NO_SHARED_HOME tag are
    # published as well.  Optionally installs the Torque submit filter
    # (MPI_SUBMIT_FILTER=yes).
    #
    # Reads:    MPI_<FLAVOUR>_ENABLE, MPI_<FLAVOUR>_VERSION,
    #           MPI_START_VERSION, MPI_SHARED_HOME, MPI_SUBMIT_FILTER,
    #           CONFIG_MAUI
    # Modifies: CE_RUNTIMEENV (tags are appended, one per line)
    # Returns:  0 always.

    # These are the flavours of MPI currently supported. If you want to add
    # a new one just append it to this variable.
    MPI_FLAVOURS="MPICH MPICH2 LAM OPENMPI"

    SOME_MPI_IS_SELECTED="false"

    ####@ Add tags for MPI to Glue
    for FLAVOUR in ${MPI_FLAVOURS}; do

        # Indirect lookup of the per-flavour site-info.def variables,
        # e.g. MPI_OPENMPI_ENABLE and MPI_OPENMPI_VERSION.
        eval FLAVOUR_ENABLED="\${MPI_${FLAVOUR}_ENABLE}"
        eval FLAVOUR_VERSION="\${MPI_${FLAVOUR}_VERSION}"

        # POSIX "=" (not bash-only "==") for consistency with the other
        # tests in this file.
        if [ "x${FLAVOUR_ENABLED}" = "xyes" ]
        then
            SOME_MPI_IS_SELECTED="true"
            yaimlog DEBUG "Configuring ${FLAVOUR} (Version \"${FLAVOUR_VERSION}\")"
            # Beware of the line!  The embedded newline is intentional:
            # each tag must end up on its own line in CE_RUNTIMEENV.
            CE_RUNTIMEENV="${CE_RUNTIMEENV}    ${FLAVOUR}
"
            yaimlog DEBUG "Added $FLAVOUR to set to CE_RUNTIMEENV"

            if [ "x${FLAVOUR_VERSION}" = "x" ]
            then
                yaimlog WARNING "MPI_${FLAVOUR}_VERSION is not set. Cannot detect installed version, please define in site-info.def"
            else
                # Beware of the line!  Intentional embedded newline again.
                CE_RUNTIMEENV="${CE_RUNTIMEENV}    ${FLAVOUR}-${FLAVOUR_VERSION}
"
                yaimlog DEBUG "Added ${FLAVOUR}-${FLAVOUR_VERSION} to CE_RUNTIMEENV"
            fi
        fi
    done
    #-----
    # Only advertise MPI functionality if at least ONE flavour
    # of MPI is being configured.
    #-----
    if [ "x$SOME_MPI_IS_SELECTED" = "xtrue" ]
    then
        # Assuming that mpi-start will get installed
        CE_RUNTIMEENV="${CE_RUNTIMEENV}    MPI-START
"

        # include version info in tag (See mpi-start #52)
        if [ "x${MPI_START_VERSION}" = "x" ] ; then
            # Version not configured by the site: ask the installed
            # mpi-start binary instead.
            VERSION=`mpi-start -V 2> /dev/null`
            st=$?
            if [ $st -eq 0 ] ; then
                # Clean the version string: drop the program name, any
                # leading "v"/"V", and all whitespace,
                # e.g. "mpi-start v1.0" -> "1.0".
                # (Use the portable sed "-e" option; "-s" is GNU-specific
                # and means something else entirely.)
                VERSION=`echo $VERSION | sed -e 's/mpi-start//' -e 's/[vV]//' | tr -d " "`
                CE_RUNTIMEENV="${CE_RUNTIMEENV}    MPI-START-${VERSION}
"
                yaimlog DEBUG "Added MPI-START-$VERSION to CE_RUNTIMEENV"
            else
                yaimlog WARNING "Unable to determine mpi-start version, it will not be published"
            fi
        else
            CE_RUNTIMEENV="${CE_RUNTIMEENV}    MPI-START-${MPI_START_VERSION}
"
            yaimlog DEBUG "Added MPI-START-$MPI_START_VERSION to CE_RUNTIMEENV"
        fi

        # This section is no longer applicable - the WMS will handle this correctly
        #
        # currently need to add MPICH even if MPICH not enabled
        # this is because the only multi-node jobtype is MPICH
        #	if [ "xMPI_MPICH_ENABLE" != "xyes" ]; then
        #	    CE_RUNTIMEENV="${CE_RUNTIMEENV}    MPICH
        #"
        #	fi

        # If SHARED_HOME is defined and not set to "no" then publish tag.
        # (Two separate tests instead of the obsolescent "-a" operator.)
        if [ "x${MPI_SHARED_HOME}" != "x" ] && [ "x$MPI_SHARED_HOME" != "xno" ]
        then
            # Beware of the line!  Intentional embedded newline.
            CE_RUNTIMEENV="${CE_RUNTIMEENV}    MPI_SHARED_HOME
"
            yaimlog DEBUG "Added MPI_SHARED_HOME to CE_RUNTIMEENV"
        else
            CE_RUNTIMEENV="${CE_RUNTIMEENV}    MPI_NO_SHARED_HOME
"
            yaimlog DEBUG "Added MPI_NO_SHARED_HOME to CE_RUNTIMEENV"

        fi
    else
        yaimlog INFO "No MPI flavours enabled."
    fi

    yaimlog DEBUG "CE_RUNTIMEENV is configured as: $CE_RUNTIMEENV"

    ####@ Configure Torque submit_filter to correctly
    ####@ handle CPU allocation.
    if [ "x${MPI_SUBMIT_FILTER}" = "xyes" ]
    then
        config_mpi_torque_submit_filter
        if [ "x${CONFIG_MAUI}" != "xyes" ]
        then
            yaimlog WARNING "Ensure that your maui configuration includes ENABLEMULTIREQJOBS set to TRUE!"
        fi
    fi

    return 0
}

config_mpi_torque_submit_filter() {

    # Install a Torque/PBS submit filter that rewrites simple
    # "#PBS -l nodes=N" requests into per-cpu allocations, then register the
    # filter in torque.cfg.  The filter itself is a Perl script emitted by
    # the heredoc below.
    #
    # Reads:    TORQUE_VAR_DIR (optional; auto-detected when unset)
    # Writes:   ${TORQUE_VAR_DIR}/submit_filter (mode 755) and
    #           ${TORQUE_VAR_DIR}/torque.cfg (SUBMITFILTER line)
    # Returns:  0 on success; exits 1 when no Torque var directory is found.

    # Locate the Torque var directory: honour an explicit TORQUE_VAR_DIR,
    # otherwise probe the two conventional install locations.
    if [ -z "${TORQUE_VAR_DIR}" ]; then 
        yaimlog DEBUG "TORQUE_VAR_DIR not set" 
        if [ -d "/var/lib/torque" ]; then 
            yaimlog DEBUG "Using /var/lib/torque" 
            TORQUE_VAR_DIR="/var/lib/torque" 
        else 
            if [ -d "/var/torque" ]; then 
                yaimlog DEBUG "Using /var/torque" 
                TORQUE_VAR_DIR="/var/torque" 
            else 
                yaimlog ERROR "/var/torque nor /var/lib/torque are found!" 
                exit 1 
            fi
        fi
    else 
        yaimlog DEBUG "TORQUE_VAR_DIR is set to $TORQUE_VAR_DIR"
        # No-op self-assignment; kept as-is for historical reasons.
        TORQUE_VAR_DIR=${TORQUE_VAR_DIR}
    fi
  
    MPI_SUBMIT_FILTER=${TORQUE_VAR_DIR}/submit_filter

    # Write the Perl submit filter.  The heredoc delimiter is deliberately
    # unquoted so the shell expands variables while writing; every Perl
    # sigil in the script is therefore escaped as "\$" to survive
    # literally.  Do not edit the heredoc body without keeping that
    # escaping intact.
    cat << EOF >${MPI_SUBMIT_FILTER}
#!/usr/bin/perl

# This script read a submission script on the standard input, modifies
# it, and writes the modified script on standard output.  This script
# makes two modifications:
#
#   * correct the node specification to allow all cpus to be used
#   * adds a NOQUEUE flag if the job came in on the sdj queue
#
while (<STDIN>) {

    # By default just copy the line.
    \$line = \$_;

    # If there is a nodes line, then extract the value and adjust it
    # as necessary.  Only modify the simple nodes request.  If there
    # is a more complicated request assume that the user knows what
    # he/she is doing and leave it alone.
    if (m/#PBS\s+-l\s+nodes=(\d+)\s*\$/) {
        \$line = process_nodes(\$1);

        # If the line wasn't empty, then multiple CPUs have been 
        # requested.  Mark this as an MPI job.
        if (\$line ne '') {
          \$line .= "\n#PBS -A mpi\n";
        }
    }

    # If there is a queue option, check to see if it is "sdj".
    # If so, then add the option to not allow such jobs to be 
    # queued.
    if (m/#PBS\s+-q\s+sdj/) {
        \$line .= "#PBS -W x=\"FLAGS:NOQUEUE\"\n";
    }

    # If there is an existing accounts line, delete it.  The account 
    # should not be set to the DN, because an internal maui table is
    # filled which prevents standing reservations from being defined.
    if (m/#PBS\s+-A/) {
        \$line = '';
    }

    print \$line;
}

# This takes the number specified in the "nodes" specification and
# returns a "PBS -l" line which can be allocated on the available
# resources.  This essentially does per-cpu allocation.
sub process_nodes {
    my \$nodes = shift;
    my \$line = "";
    
    # If the requested number of nodes is 1, just return an empty string.
    if (\$nodes == 1) {
      return "";
    }

    # Collect information from the pbsnodes command on the number of
    # machine and cpus available.  Don't do anything with offline
    # nodes.
    open PBS, "pbsnodes -a |";
    my \$state = 1;
    my %machines;
    while (<PBS>) {
        if (m/^\s*state\s*=\s*(\w+)/) {
            \$state = (\$1 eq "offline") ? 0 : 1;
        # This may be changed to fit your nodes description
        } elsif (m/^\s*np\s*=\s*(\d+)/) {
        # } elsif (m/^\s*status\s*=\s*.*ncpus=(\d+),/) {
            my \$ncpus = \$1;
            if (\$state) {
                if (defined(\$machines{\$ncpus})) {
                    \$machines{\$ncpus} = \$machines{\$ncpus}+1;
                } else {
                    \$machines{\$ncpus} = 1;
                }
            }
        }
    }
    close PBS;

    # Count the total number of machines and cpus.
    my \$tnodes = 0;
    my \$tcpus = 0;
    my \$maxcpu = 0;
    foreach my \$ncpus (sort num_ascending keys %machines) {
        \$tnodes += \$machines{\$ncpus};
        \$tcpus += \$machines{\$ncpus}*\$ncpus;
        \$maxcpu = \$ncpus if (\$tcpus>=\$nodes);
    }

    if (\$maxcpu==0) {

        # There aren't enough cpus to handle the request.  Just pass
        # the request through and let the job fail.
        \$line .= "#PBS -l nodes=\$nodes\n";

    } else {

        \$line .="#PBS -l ";

        # We've already identified the largest machine we'll have to
        # allocate.  Start by allocating one of those and iterate until
        # all are used.
        my %allocated;
        my \$remaining_cpus = \$nodes;
        my \$remaining_nodes = \$tnodes;
        foreach my \$ncpus (sort num_descending keys %machines) {
            if (\$ncpus<=\$maxcpu && \$remaining_cpus>0) {
                my \$nmach = \$machines{\$ncpus};
                for (my \$i=0;
                     (\$i<\$nmach) && (\$remaining_cpus>\$remaining_nodes);
                     \$i++) {

                    \$remaining_cpus -= \$ncpus;
                    \$remaining_nodes -= 1;

                    # May only have to use part of a node.  Check here
                    # for that case.
                    my \$used = (\$remaining_cpus>=0)
                        ? \$ncpus
                        : \$ncpus+\$remaining_cpus;

                    # Increase the allocation.
                    if (defined(\$allocated{\$used})) {
                        \$allocated{\$used} += 1;
                    } else {
                        \$allocated{\$used} = 1;
                    }
                }

                # If we can fill out the rest without restricting the
                # number of cpus on a node, do so.
                if (\$remaining_cpus<=\$remaining_nodes &&
                    \$remaining_cpus>0) {

                    my \$used = 1;
                    if (defined(\$allocated{\$used})) {
                        \$allocated{\$used} += \$remaining_cpus;
                    } else {
                        \$allocated{\$used} = \$remaining_cpus;
                    }
                    \$remaining_cpus = 0;
                }
            }
        }

        my \$first = 1;
        foreach my \$i (sort num_descending keys %allocated) {
            \$line .= "+" unless \$first;
            \$line .= "nodes=" if \$first;
#           \$line .= "nodes=";
            \$line .= \$allocated{\$i};
#           \$line .= ":ppn=" . \$i unless (\$i == 1);
            \$line .= ":ppn=" . \$i;
            \$first = 0;
        }
        \$line .= "\n";
    }

    return \$line;
}


sub num_ascending { \$a <=> \$b; }


sub num_descending { \$b <=> \$a; }
EOF

    # Torque runs the filter itself, so it must be executable.
    chmod 755 ${MPI_SUBMIT_FILTER}
    
    MPI_TORQUE_CFG="${TORQUE_VAR_DIR}/torque.cfg"

####@ Enable MPI torque submit filter in torque.cfg
    # Register the filter: append a SUBMITFILTER line to an existing
    # torque.cfg only when none is present yet (the grep guard keeps the
    # operation idempotent); otherwise create the file from scratch.
    if [ -w ${MPI_TORQUE_CFG} ]
    then
        if [ "x`grep SUBMIT ${MPI_TORQUE_CFG}`" = "x" ]
        then
            echo "SUBMITFILTER $MPI_SUBMIT_FILTER" >> ${MPI_TORQUE_CFG}
        fi
    else
        echo "SUBMITFILTER $MPI_SUBMIT_FILTER" > ${MPI_TORQUE_CFG}
    fi

    return 0
}
