{
    "version":"0.2.0",
    "id":"multidetect-and-clean",
    "use_case":"BOKU",
    "title":"Run multidetect and clean the data",
    "description":"This process detects outliers using various methods and cleans the data: Ensemble multiple outlier detection methods to ably compare the outliers flagged by each method; then extract final clean data using either absolute or best method generated outliers. For more details, please ask BOKU.",
    "jobControlOptions":[
        "sync-execute",
        "async-execute"
    ],
    "keywords":[
        "pan-european use case",
        "AquaINFRA",
        "R"
    ],
    "links":[
        {
            "type":"text/html",
            "rel":"about",
            "title":"GitHub repo",
            "href":"https://github.com/AnthonyBasooma/specleanr",
            "hreflang":"en-US"
        },
        {
            "type":"text/html",
            "rel":"about",
            "title":"Vignette containing this example",
            "href":"https://github.com/AnthonyBasooma/specleanr/blob/master/vignettes/generaloutlier.Rmd",
            "hreflang":"en-US"
        },
        {
            "type":"application/json",
            "rel":"self",
            "href":"https://aquainfra.ogc.igb-berlin.de/pygeoapi/processes/multidetect-and-clean?f=json",
            "title":"Process description as JSON",
            "hreflang":"en-US"
        },
        {
            "type":"text/html",
            "rel":"alternate",
            "href":"https://aquainfra.ogc.igb-berlin.de/pygeoapi/processes/multidetect-and-clean?f=html",
            "title":"Process description as HTML",
            "hreflang":"en-US"
        },
        {
            "type":"text/html",
            "rel":"http://www.opengis.net/def/rel/ogc/1.0/job-list",
            "href":"https://aquainfra.ogc.igb-berlin.de/pygeoapi/jobs?f=html",
            "title":"Jobs list as HTML",
            "hreflang":"en-US"
        },
        {
            "type":"application/json",
            "rel":"http://www.opengis.net/def/rel/ogc/1.0/job-list",
            "href":"https://aquainfra.ogc.igb-berlin.de/pygeoapi/jobs?f=json",
            "title":"Jobs list as JSON",
            "hreflang":"en-US"
        },
        {
            "type":"application/json",
            "rel":"http://www.opengis.net/def/rel/ogc/1.0/execute",
            "href":"https://aquainfra.ogc.igb-berlin.de/pygeoapi/processes/multidetect-and-clean/execution?f=json",
            "title":"Execution for this process as JSON",
            "hreflang":"en-US"
        }
    ],
    "inputs":{
        "input_data":{
            "title":"Input table",
            "description":"URL to the input table containing the data to be cleaned from outliers: Data sets for multiple or single species from pred_extract and other sources.",
            "schema":{
                "type":"string"
            },
            "minOccurs":1,
            "maxOccurs":1,
            "keywords":[
                "table",
                "csv"
            ]
        },
        "colname_variable":{
            "title":"Variable of interest",
            "description":"Column name identifying the variable of interest where outliers will be checked from for univariate outlier detection methods such as Z-score, mixed interquantile range, reverse jackknifing",
            "schema":{
                "type":"string"
            },
            "minOccurs":1,
            "maxOccurs":1,
            "keywords":[
                "column"
            ]
        },
        "select_columns":{
            "title":"Specify columns to be checked",
            "description":"In a multivariate dataset, if only particular columns needs to be checked then they should be indicated here. Otherwise all columns will be considerd in outlier detection.",
            "schema":{
                "type":"string"
            },
            "minOccurs":0,
            "maxOccurs":1,
            "keywords":[]
        },
        "multiple_species":{
            "title":"Distinguish between multiple species",
            "description":"If NO, then only a single species dataset is expected. ",
            "schema":{
                "type":"boolean"
            },
            "minOccurs":1,
            "maxOccurs":1,
            "keywords":[]
        },
        "output_type":{
            "title":"Output type",
            "description":"Set whether you want to return outliers or clean dataset. Example: outlier.",
            "schema":{
                "type":"string"
            },
            "minOccurs":1,
            "maxOccurs":1,
            "keywords":[]
        },
        "group_colname":{
            "title":"Column name including group names",
            "description":"For multiple groups in a dataframe, provide the column name containing the groups to be checked. For example, a column name with species name in a dataset.",
            "schema":{
                "type":"string"
            },
            "minOccurs":0,
            "maxOccurs":1,
            "keywords":[]
        },
        "colname_exclude":{
            "title":"Column names to exclude",
            "description":"Exclude mainly numerical variables that are not necessary in the analysis, for example x and y columns or latitude/longitude, row numbers or serial IDs. Categorical variables are removed automatically in the data preparation.",
            "schema":{
                "type":"string"
            },
            "minOccurs":0,
            "maxOccurs":1,
            "keywords":[
                "column"
            ]
        },
        "methods":{
            "title":"Outlier detection methods",
            "description":"The name of methods for Outlier detection to be used, as a comma-separated string. Example = \"mixediqr, logboxplot, iqr, distboxplot, jknife, semiqr, hampel, iforest, lof, mahal\".",
            "schema":{
                "type":"string"
            },
            "minOccurs":1,
            "maxOccurs":1,
            "keywords":[
                "column"
            ]
        },
        "silence_true_errors":{
            "title":"Silence methods that geuninely fail during outlier detection",
            "description":"If YES, silence errors for methods that genuinely druing the outlier detection process but continue without breaking other methods.",
            "schema":{
                "type":"boolean"
            },
            "minOccurs":1,
            "maxOccurs":1,
            "keywords":[]
        },
        "boot_run":{
            "title":"Bootstrapping execution",
            "description":"If set to YES, then bootstrapping will be done for small samples.",
            "schema":{
                "type":"boolean"
            },
            "minOccurs":1,
            "maxOccurs":1,
            "keywords":[
                "column"
            ]
        },
        "boot_maxrecords":{
            "title":"Maximum records to intiate bootstrapping",
            "description":"The user can adjust the maximum records were to be be bootstrapped.",
            "schema":{
                "type":"integer"
            },
            "minOccurs":0,
            "maxOccurs":1,
            "keywords":[
                "column"
            ]
        },
        "number_of_boots":{
            "title":"Number of bootstraps",
            "description":"The number of bootstraps to generate during bootstrapping.",
            "schema":{
                "type":"integer"
            },
            "minOccurs":0,
            "maxOccurs":1,
            "keywords":[
                "column"
            ]
        },
        "setseed":{
            "title":"Set seed",
            "description":"During bootstrapping, random samples are generated that requires to set a seed for reproducibility.",
            "schema":{
                "type":"integer"
            },
            "minOccurs":1,
            "maxOccurs":1,
            "keywords":[
                "column"
            ]
        },
        "boot_threshold":{
            "title":"Threshold value to flag a record from bootstrap samples",
            "description":"As a record is flagged in multiple bootstrap samples, a threshold is required to extract an outlier. For instance, 0.6, meaning if a record is flagged 6 of the 10 bootsraps, will be flagged as an outlier.",
            "schema":{
                "type":"number"
            },
            "minOccurs":0,
            "maxOccurs":1,
            "keywords":[
                "column"
            ]
        },
        "exceute_pca":{
            "title":"Execute Principal Component Analysis",
            "description":"If true, then PCA will be intiated.",
            "schema":{
                "type":"boolean"
            },
            "minOccurs":1,
            "maxOccurs":1,
            "keywords":[]
        },
        "number_of_pca":{
            "title":"Number of principal components to retain",
            "description":"The user can indicate the maximum number of principal components to retain in the outlier detection.",
            "schema":{
                "type":"integer"
            },
            "minOccurs":0,
            "maxOccurs":1,
            "keywords":[]
        },
        "pca_silence":{
            "title":"Hide messages during PCA analysis",
            "description":"Messages during PCA analyis are returned if set to NO. YES means to silence the messages like the variance explained from the total PCs selected.",
            "schema":{
                "type":"boolean"
            },
            "minOccurs":0,
            "maxOccurs":1,
            "keywords":[]
        },
        "pcavariable":{
            "title":"Select the principal component among the PCs retained to be used as variable of interest",
            "description":"PC1 is the variable selected for outlier detection. Other PC2 can can also be set. PC1 is advisable.",
            "schema":{
                "type":"string"
            },
            "minOccurs":0,
            "maxOccurs":1,
            "keywords":[]
        },
        "sdm_data":{
            "title":"Change the outlier detection routine based on data type",
            "description":"If it is univariate data, then set to NO. All data that requires multivariate analysis such as using kmeans, isolation forest, set to YES.",
            "schema":{
                "type":"boolean"
            },
            "minOccurs":1,
            "maxOccurs":1,
            "keywords":[]
        },
        "inform_na_outlier":{
            "title":"Hide messages for removing NAs",
            "description":"If set to YES, NAs removed will be displayed for each group variable.",
            "schema":{
                "type":"boolean"
            },
            "minOccurs":0,
            "maxOccurs":1,
            "keywords":[]
        },
        "missingness":{
            "title":"Percentage missing values in a group",
            "description":"Allowed missing values in a column to allow a user decide whether to remove the individual columns or rows from the data sets. For instance, 0.1: If a column has more than 10 % missing values, then it will be removed from the dataset rather than the rows.",
            "schema":{
                "type":"number"
            },
            "minOccurs":0,
            "maxOccurs":1,
            "keywords":[]
        },
        "classify_or_autoremove":{
            "title":"Either use outlier classification or autoremoval with threshold or LOESS method",
            "description":"The parameter allows to switch from outlification that labels all records as perfect outlier to fair outliers to allows further scrutiny. Otherwise, the outliers will be dropped based on a threshold set naively or using LOESS method.",
            "schema":{
                "type":"boolean"
            },
            "minOccurs":1,
            "maxOccurs":1,
            "keywords":[]
        },
        "bool_loess":{
            "title":"Data extraction parameter: LOESS",
            "description":"If set to true then the local regression method for data extraction will be used",
            "schema":{
                "type":"boolean"
            },
            "minOccurs":0,
            "maxOccurs":1,
            "keywords":[]
        },
        "threshold_clean":{
            "title":"Data extraction parameter: Threshold setting",
            "description":"If Data extraction parameter: LOESS is NO, then a threshold value can be provided. The threshold will significantly determine which is flagged as an absolute outlier.",
            "schema":{
                "type":"number"
            },
            "minOccurs":0,
            "maxOccurs":1,
            "keywords":[]
        },
        "outlierweights_mode":{
            "title":"Data extraction parameter: Outlier weighting",
            "description":"Outlier weighting and selection methods. abs uses record proportional to identify an absolute outlier. Example: abs",
            "schema":{
                "type":"string"
            },
            "minOccurs":0,
            "maxOccurs":1,
            "keywords":[]
        },
        "classifymode":{
            "title":"Cuts data into classess",
            "description":"Categorize data base on the correlation coefficient manner based on Akoglu 2018. Example: med",
            "schema":{
                "type":"integer"
            },
            "minOccurs":0,
            "maxOccurs":1,
            "keywords":[]
        },
        "eif_bool":{
            "title":"Emprical Influence Function",
            "description":"Computes the Emprical Influence Function for classified outliers.",
            "schema":{
                "type":"boolean"
            },
            "minOccurs":0,
            "maxOccurs":1,
            "keywords":[
                "column"
            ]
        }
    },
    "outputs":{
        "cleaned_data":{
            "title":"Cleaned data",
            "description":"A table with data extracted from the reference dataset set during outlier detection.",
            "schema":{
                "type":"object",
                "contentMediaType":"application/json"
            }
        }
    },
    "example":{
        "inputs":{
            "input_data":"https://example.com/multiprecleaned.csv",
            "colname_variable":"Sepal.Length",
            "select_columns":null,
            "multiple_species":true,
            "output_type":"outlier",
            "group_colname":"Species",
            "colname_exclude":null,
            "methods":"mixediqr, logboxplot, iqr, distboxplot, jknife, semiqr, hampel",
            "silence_true_errors":false,
            "boot_run":false,
            "boot_maxrecords":30,
            "number_of_boots":5,
            "setseed":1125,
            "boot_threshold":0.6,
            "exceute_pca":true,
            "number_of_pca":2,
            "pca_silence":true,
            "pcavariable":"PC1",
            "sdm_data":true,
            "inform_na_outlier":true,
            "missingness":1.0,
            "bool_loess":true,
            "threshold_clean":null,
            "outlierweights_mode":"abs",
            "classifymode":"med",
            "eif_bool":false,
            "classify_or_autoremove":true
        }
    },
    "outputTransmission":[
        "value"
    ]
}