% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/generics.R, R/functions.R
\name{column_aggregate_functions}
\alias{column_aggregate_functions}
\alias{approx_count_distinct,Column-method}
\alias{approx_count_distinct}
\alias{approxCountDistinct,Column-method}
\alias{approxCountDistinct}
\alias{kurtosis,Column-method}
\alias{kurtosis}
\alias{max,Column-method}
\alias{max}
\alias{max_by,Column,Column-method}
\alias{max_by}
\alias{max_by,Column-method}
\alias{mean,Column-method}
\alias{mean}
\alias{min,Column-method}
\alias{min}
\alias{min_by,Column,Column-method}
\alias{min_by}
\alias{min_by,Column-method}
\alias{product,Column-method}
\alias{product}
\alias{percentile_approx,characterOrColumn,numericOrColumn-method}
\alias{percentile_approx}
\alias{percentile_approx,Column-method}
\alias{sd,Column-method}
\alias{sd}
\alias{skewness,Column-method}
\alias{skewness}
\alias{stddev,Column-method}
\alias{stddev}
\alias{std,Column-method}
\alias{std}
\alias{stddev_pop,Column-method}
\alias{stddev_pop}
\alias{stddev_samp,Column-method}
\alias{stddev_samp}
\alias{sum,Column-method}
\alias{sum}
\alias{sum_distinct,Column-method}
\alias{sum_distinct}
\alias{sumDistinct,Column-method}
\alias{sumDistinct}
\alias{var,Column-method}
\alias{var}
\alias{variance,Column-method}
\alias{variance}
\alias{var_pop,Column-method}
\alias{var_pop}
\alias{var_samp,Column-method}
\alias{var_samp}
\alias{count_distinct,Column-method}
\alias{count_distinct}
\alias{countDistinct,Column-method}
\alias{countDistinct}
\alias{n_distinct,Column-method}
\alias{n_distinct}
\alias{collect_list,Column-method}
\alias{collect_list}
\alias{collect_set,Column-method}
\alias{collect_set}
\alias{grouping_bit,Column-method}
\alias{grouping_bit}
\alias{grouping_id,Column-method}
\alias{grouping_id}
\title{Aggregate functions for Column operations}
\usage{
approx_count_distinct(x, ...)

approxCountDistinct(x, ...)

collect_list(x)

collect_set(x)

count_distinct(x, ...)

countDistinct(x, ...)

grouping_bit(x)

grouping_id(x, ...)

kurtosis(x)

max_by(x, y)

min_by(x, y)

n_distinct(x, ...)

percentile_approx(x, percentage, ...)

product(x)

sd(x, na.rm = FALSE)

skewness(x)

stddev(x)

std(x)

stddev_pop(x)

stddev_samp(x)

sum_distinct(x)

sumDistinct(x)

var(x, y = NULL, na.rm = FALSE, use)

variance(x)

var_pop(x)

var_samp(x)

\S4method{approx_count_distinct}{Column}(x, rsd = 0.05)

\S4method{approxCountDistinct}{Column}(x, rsd = 0.05)

\S4method{kurtosis}{Column}(x)

\S4method{max}{Column}(x)

\S4method{max_by}{Column,Column}(x, y)

\S4method{mean}{Column}(x)

\S4method{min}{Column}(x)

\S4method{min_by}{Column,Column}(x, y)

\S4method{product}{Column}(x)

\S4method{percentile_approx}{characterOrColumn,numericOrColumn}(x, percentage, accuracy = 10000)

\S4method{sd}{Column}(x)

\S4method{skewness}{Column}(x)

\S4method{stddev}{Column}(x)

\S4method{std}{Column}(x)

\S4method{stddev_pop}{Column}(x)

\S4method{stddev_samp}{Column}(x)

\S4method{sum}{Column}(x)

\S4method{sum_distinct}{Column}(x)

\S4method{sumDistinct}{Column}(x)

\S4method{var}{Column}(x)

\S4method{variance}{Column}(x)

\S4method{var_pop}{Column}(x)

\S4method{var_samp}{Column}(x)

\S4method{approx_count_distinct}{Column}(x, rsd = 0.05)

\S4method{approxCountDistinct}{Column}(x, rsd = 0.05)

\S4method{count_distinct}{Column}(x, ...)

\S4method{countDistinct}{Column}(x, ...)

\S4method{n_distinct}{Column}(x, ...)

\S4method{collect_list}{Column}(x)

\S4method{collect_set}{Column}(x)

\S4method{grouping_bit}{Column}(x)

\S4method{grouping_id}{Column}(x, ...)
}
\arguments{
\item{x}{Column to compute on.}

\item{...}{additional argument(s). For example, it could be used to pass additional Columns.}

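\item{y, na.rm, use}{currently not used by \code{sd} and \code{var}. For \code{max_by} and
\code{min_by}, \code{y} is the Column whose maximum or minimum value selects the
returned value of \code{x}.}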

\item{percentage}{Numeric percentage at which percentile should be computed.
All values should be between 0 and 1.
If its length equals 1, the resulting column is of type double;
otherwise, it is an array of doubles.}

\item{rsd}{maximum relative standard deviation allowed (default = 0.05).}

\item{accuracy}{A positive numeric literal (default: 10000) which
controls approximation accuracy at the cost of memory.
A higher value of accuracy yields better accuracy; 1.0/accuracy
is the relative error of the approximation.}
}
\description{
Aggregate functions defined for \code{Column}.
}
\details{
\code{approx_count_distinct}: Returns the approximate number of distinct items in a group.

\code{approxCountDistinct}: Returns the approximate number of distinct items in a group.

\code{kurtosis}: Returns the kurtosis of the values in a group.

\code{max}: Returns the maximum value of the expression in a group.

\code{max_by}: Returns the value of \code{x} associated with the maximum value of \code{y}.

\code{mean}: Returns the average of the values in a group. Alias for \code{avg}.

\code{min}: Returns the minimum value of the expression in a group.

\code{min_by}: Returns the value of \code{x} associated with the minimum value of \code{y}.

\code{product}: Returns the product of the values in a group.

\code{percentile_approx}: Returns the approximate \code{percentile} of the numeric column
\code{x}, which is the smallest value in the ordered \code{x} values (sorted from least to
greatest) such that no more than \code{percentage} of \code{x} values is less than
or equal to that value.

\code{sd}: Alias for \code{stddev_samp}.

\code{skewness}: Returns the skewness of the values in a group.

\code{stddev}: Alias for \code{stddev_samp}.

\code{std}: Alias for \code{stddev}.

\code{stddev_pop}: Returns the population standard deviation of the expression in a group.

\code{stddev_samp}: Returns the unbiased sample standard deviation of the expression in a group.

\code{sum}: Returns the sum of all values in the expression.

\code{sum_distinct}: Returns the sum of distinct values in the expression.

\code{sumDistinct}: Returns the sum of distinct values in the expression.

\code{var}: Alias for \code{var_samp}.

\code{var_pop}: Returns the population variance of the values in a group.

\code{var_samp}: Returns the unbiased variance of the values in a group.

\code{count_distinct}: Returns the number of distinct items in a group.

\code{countDistinct}: Returns the number of distinct items in a group.

\code{countDistinct} is an alias of \code{count_distinct}; using \code{count_distinct}
directly is encouraged.

\code{n_distinct}: Returns the number of distinct items in a group.

\code{collect_list}: Creates a list of objects with duplicates.
Note: the function is non-deterministic because the order of collected results depends
on the order of the rows which may be non-deterministic after a shuffle.

\code{collect_set}: Creates a list of objects with duplicate elements eliminated.
Note: the function is non-deterministic because the order of collected results depends
on the order of the rows which may be non-deterministic after a shuffle.

\code{grouping_bit}: Indicates whether a specified column in a GROUP BY list is aggregated or
not; returns 1 for aggregated or 0 for not aggregated in the result set. Same as \code{GROUPING}
in SQL and the \code{grouping} function in Scala.

\code{grouping_id}: Returns the level of grouping.
Equal to \code{
grouping_bit(c1) * 2^(n - 1) + grouping_bit(c2) * 2^(n - 2)  + ... + grouping_bit(cn)
}.
}
\note{
approx_count_distinct(Column) since 3.0.0

approxCountDistinct(Column) since 1.4.0

kurtosis since 1.6.0

max since 1.5.0

max_by since 3.3.0

mean since 1.5.0

min since 1.5.0

min_by since 3.3.0

product since 3.2.0

percentile_approx since 3.1.0

sd since 1.6.0

skewness since 1.6.0

stddev since 1.6.0

std since 3.5.0

stddev_pop since 1.6.0

stddev_samp since 1.6.0

sum since 1.5.0

sum_distinct since 3.2.0

sumDistinct since 1.4.0

var since 1.6.0

variance since 1.6.0

var_pop since 1.5.0

var_samp since 1.6.0

approx_count_distinct(Column, numeric) since 3.0.0

approxCountDistinct(Column, numeric) since 1.4.0

count_distinct since 3.2.0

countDistinct since 1.4.0

n_distinct since 1.4.0

collect_list since 2.3.0

collect_set since 2.3.0

grouping_bit since 2.3.0

grouping_id since 2.3.0
}
\examples{
\dontrun{
# Dataframe used throughout this doc
df <- createDataFrame(cbind(model = rownames(mtcars), mtcars))}

\dontrun{
head(select(df, approx_count_distinct(df$gear)))
head(select(df, approx_count_distinct(df$gear, 0.02)))
head(select(df, count_distinct(df$gear, df$cyl)))
head(select(df, n_distinct(df$gear)))
head(distinct(select(df, "gear")))}

\dontrun{
head(select(df, mean(df$mpg), sd(df$mpg), skewness(df$mpg), kurtosis(df$mpg)))}

\dontrun{
df <- createDataFrame(
  list(list("Java", 2012, 20000), list("dotNET", 2012, 5000),
       list("dotNET", 2013, 48000), list("Java", 2013, 30000)),
  list("course", "year", "earnings")
)
tmp <- agg(groupBy(df, df$"course"), "max_by" = max_by(df$"year", df$"earnings"))
head(tmp)}

\dontrun{
head(select(df, avg(df$mpg), mean(df$mpg), sum(df$mpg), min(df$wt), max(df$qsec)))

# metrics by num of cylinders
tmp <- agg(groupBy(df, "cyl"), avg(df$mpg), avg(df$hp), avg(df$wt), avg(df$qsec))
head(orderBy(tmp, "cyl"))

# car with the max mpg
mpg_max <- as.numeric(collect(agg(df, max(df$mpg))))
head(where(df, df$mpg == mpg_max))}

\dontrun{
df <- createDataFrame(
  list(list("Java", 2012, 20000), list("dotNET", 2012, 5000),
       list("dotNET", 2013, 48000), list("Java", 2013, 30000)),
  list("course", "year", "earnings")
)
tmp <- agg(groupBy(df, df$"course"), "min_by" = min_by(df$"year", df$"earnings"))
head(tmp)}

\dontrun{
head(select(df, sd(df$mpg), stddev(df$mpg), stddev_pop(df$wt), stddev_samp(df$qsec)))}

\dontrun{
head(select(df, sum_distinct(df$gear)))
head(distinct(select(df, "gear")))}

\dontrun{
head(agg(df, var(df$mpg), variance(df$mpg), var_pop(df$mpg), var_samp(df$mpg)))}

\dontrun{
df2 = df[df$mpg > 20, ]
collect(select(df2, collect_list(df2$gear)))
collect(select(df2, collect_set(df2$gear)))}

\dontrun{
# With cube
agg(
  cube(df, "cyl", "gear", "am"),
  mean(df$mpg),
  grouping_bit(df$cyl), grouping_bit(df$gear), grouping_bit(df$am)
)

# With rollup
agg(
  rollup(df, "cyl", "gear", "am"),
  mean(df$mpg),
  grouping_bit(df$cyl), grouping_bit(df$gear), grouping_bit(df$am)
)}

\dontrun{
# With cube
agg(
  cube(df, "cyl", "gear", "am"),
  mean(df$mpg),
  grouping_id(df$cyl, df$gear, df$am)
)

# With rollup
agg(
  rollup(df, "cyl", "gear", "am"),
  mean(df$mpg),
  grouping_id(df$cyl, df$gear, df$am)
)}
}
\seealso{
Other aggregate functions: 
\code{\link{avg}()},
\code{\link{corr}()},
\code{\link{count}()},
\code{\link{cov}()},
\code{\link{first}()},
\code{\link{last}()}
}
\concept{aggregate functions}
