\documentclass{article}
\usepackage[margin=1in]{geometry}
\usepackage{hyperref}
\usepackage{amsmath,amsfonts,amssymb,amsthm,commath}
\usepackage{enumitem}
\usepackage{framed}
\usepackage{xspace}
\usepackage{microtype}
\usepackage[round]{natbib}
\usepackage{cleveref}
\usepackage[dvipsnames]{xcolor}
% following loops stolen from djhsu
\def\ddefloop#1{\ifx\ddefloop#1\else\ddef{#1}\expandafter\ddefloop\fi}
\def\ddef#1{\expandafter\def\csname bb#1\endcsname{\ensuremath{\mathbb{#1}}}}
\ddefloop ABCDEFGHIJKLMNOPQRSTUVWXYZ\ddefloop
\def\ddef#1{\expandafter\def\csname c#1\endcsname{\ensuremath{\mathcal{#1}}}}
\ddefloop ABCDEFGHIJKLMNOPQRSTUVWXYZ\ddefloop
\DeclareMathOperator*{\argmin}{arg\,min}
\DeclareMathOperator*{\argmax}{arg\,max}
\def\SPAN{\textup{span}}
\def\tu{\textup{u}}
\def\R{\mathbb{R}}
\def\be{\mathbf{e}}
\newcommand{\ip}[2]{\left\langle #1, #2 \right \rangle}
\newtheorem{fact}{Fact}
\newtheorem{lemma}{Lemma}
\newtheorem{theorem}{Theorem}
\newtheorem{corollary}{Corollary}
\title{CSE 598 TEL - Homework 1}
\author{MJT}
\begin{document}
\maketitle
\textbf{Instructions.}
\begin{itemize}
\item
All rules on the webpage apply.
\item
You may work in groups of size at most two; put all the NetIDs clearly on the first page,
and submit through gradescope.
\item
Homework is due \textbf{Wednesday, October 5, at 11:00am}; no late homework accepted.
\item
Please consider using the provided {\LaTeX} file as a template
(apologies for the weird indentation),
or at least something vaguely visually similar, since gradescope tries to automatically locate
beginnings and ends of problems.
\end{itemize}
\begin{enumerate}
\clearpage
\item
\textbf{(Analysis I:} missing step from lecture on proof by \citet{nn_stone_weierstrass}\textbf{.)}
Provide a proof of the missing step in
\href{http://mjt.web.engr.illinois.edu/courses/f2016/mltheory/lec4.html}{lecture 4},
restated as follows.
\begin{quote}
Let $\sigma:\R\to[0,1]$ be given, and suppose it is \emph{sigmoidal},
meaning continuous,
monotone nondecreasing, and satisfying
\[
\lim_{z\to-\infty} \sigma(z) = 0,\qquad\qquad \lim_{z\to+\infty}\sigma(z) = 1.
\]
Given any $g \in \cH_{\cos} = \cbr{ x\mapsto \cos(a^\top x + b) : a\in \R^d, b \in \R}$
and any $\epsilon > 0$,
there exists $f\in \SPAN(\cH_\sigma)$ with
\[
\|f-g\|_{\tu} = \sup\left\{|f(x)-g(x)| : x\in[0,1]^d\right\} \leq \epsilon.
\]
\end{quote}
\textbf{Easy mode / hint:}
feel free to instead prove this approximation result for the norm
$\|f-g\|_1 = \int_{[0,1]^d}|f(x) -g(x)|dx$
(which is less finicky and carries more intuition from lecture),
and moreover with $d=1$.
\bigskip
\textbf{Solution.}
\clearpage
\item
\textbf{(Analysis II:} a nuisance from the neural net approximation lectures\textbf{.)}
Recall that the lectures on approximation of continuous functions by 2- and 3-layer networks
did not include a nonlinearity on the final output.
This exercise will set the record straight: namely, prove the following, where
$\|f-g\|_1 = \int_{[0,1]^d} |f(x) - g(x)|dx$ as in lecture.
\begin{quote}
Suppose a function class $\cF$ is given so that for any
continuous $g:\R^d\to\R$ and any $\tau>0$, there exists $f\in \cF$ with $\|f-g\|_1 \leq \tau$.
Prove that for any sigmoidal $\sigma : \R\to[0,1]$ (as in the previous problem)
the function class $\cF_\sigma := \{ \sigma \circ f : f\in \cF\}$ can approximate
\emph{appropriately restricted}
continuous functions, meaning for any continuous $g:\R^d\to[0,1]$ and any $\epsilon > 0$,
there exists $f\in \cF_\sigma$ with $\|f-g\|_1 \leq \epsilon$.
\end{quote}
\textbf{Easy mode / hint:}
feel free to assume $\sigma$ is any combination of Lipschitz and bijective
(with continuous inverse, even),
and that $g:\R^d\to (0,1)$ rather than $g:\R^d\to[0,1]$, but please state these assumptions clearly.
\textbf{Hard mode:} \emph{don't} make those assumptions, but be sure to check that your proof
doesn't accidentally rely on them.
\bigskip
\textbf{Solution.}
\clearpage
\item
\textbf{(A negative result} for single node neural networks\textbf{.)}
Suppose $\sigma :\R \to [0,1]$ is sigmoidal as in the previous question.
This question will develop a negative result on the representation power of
single layer networks (in particular, networks with exactly 1 node).
This result makes sense from the perspective of the result presented in
class due to \citet{minsky_papert}; they, however, had some motivation in
vision tasks, whereas here the task will be even simpler, namely univariate.
With this in mind, construct an appropriate continuous function $g : \R\to [0,1]$
and use it to (constructively) prove the following:
\begin{quote}
There exists a continuous function $g:\R\to[0,1]$ and a real $c>0$ so that
\[
\inf_{f\in \cH_\sigma} \|f-g\|_1 \geq c
\]
where
$\cH_\sigma := \cbr{ x\mapsto \sigma(a^\top x + b) : a\in \R, b\in \R }$.
\end{quote}
\bigskip
\textbf{Solution.}
\clearpage
\item
\textbf{(Polynomial approximation} \citep{weierstrass_apx}\textbf{.)}
The goal of this problem is to prove the following version of the Weierstrass Approximation Theorem:
\begin{quote}
For every continuous $g:\R^d\to\R$ and scalar $\epsilon > 0$,
there exists a polynomial $f:\R^d\to\R$ so that
\[
\|f-g\|_{\tu} = \sup\{|f(x)-g(x)| : x\in[0,1]^d \} \leq \epsilon.
\]
\end{quote}
The proof will proceed in a few steps. First recall (and don't bother to prove)
the following analysis fact from lecture.
\begin{lemma}
\label{fact:uni_cont}
Given continuous $g:\R^d\to \R$ and scalar $\epsilon > 0$,
there exists $\delta >0$ so that every $x,x' \in [0,1]^d$ with $\|x-x'\|_\infty\leq \delta$
implies $|g(x) - g(x')| \leq \epsilon/2$,
and an $M<\infty$ so that $\sup\{ |g(x)| : x\in[0,1]^d \} \leq M$.
\end{lemma}
In the rest of the proof, fix a continuous $g$ as in the problem statement,
let $\delta >0$ and $M<\infty$ be given according to the preceding lemma.
The steps of the proof are as follows.
\begin{enumerate}
\item
Let $(X_1, \ldots, X_d)$ denote independent random variables, where $X_i$ has
binomial distribution $B(n,x_i)$ corresponding to $n$ flips of an $x_i$-biased coin.
Prove that there exists $n$ such that $n > 1/ \delta^3$ and
\[
\Pr\sbr{ \exists i \in \cbr{1,\ldots, d} \centerdot |X_i - nx_i | > n^{2/3} }
< \frac {\epsilon}{4M}.
\]
(See below\footnote{\textbf{Hint:} what are some useful general-purpose
probability inequalities?} for a hint.)
\item
Prove
\[
\sum_{i_1 =0}^n \sum_{i_2 = 0}^n \cdots \sum_{i_d = 0}^n
\prod_{j=1}^d \binom{n}{i_j} x_j^{i_j} (1-x_j)^{n-i_j}
= 1.
\]
\item
Recalling that $\be_i$ denotes the $i$th standard basis vector,
define the polynomial
\[
f(x)
:=
\sum_{i_1 =0}^n \sum_{i_2 = 0}^n \cdots \sum_{i_d = 0}^n
g\del{ \sum_{j=1}^d i_j \be_j / n }
\prod_{j=1}^d \binom{n}{i_j} x_j^{i_j} (1-x_j)^{n-i_j},
\]
whose form conveniently relates to the $B(n,x_i)$ above.
With this and the other parts of this question statement in mind,
prove $\|f-g\|_{\tu} \leq \epsilon$.
(See below\footnote{\textbf{Hint:} split the sum defining $f$ into two parts,
one part being handled by \Cref{fact:uni_cont}, and the other by the
first part of the question\ldots and surely the middle part of the question
is useful too\ldots} for a hint.)
\end{enumerate}
\bigskip
\textbf{Solution.}
\clearpage
\item
\textbf{(Convexity calisthenics.)}
\begin{enumerate}
\item
Given any $p \in [1,\infty]$, define $f_p : \R^d\to\R$
as
\[
f_p(v) :=\begin{cases}
\frac 1 p \|v\|_p^p = \frac 1 p \sum_{i=1}^d |v_i|^p
&
\textup{when $p\in [1,\infty)$},
\\
\iota_{[-1,+1]^d}(v)
&
\textup{when $p= \infty$}.
\end{cases}
\]
Prove for any \emph{conjugate exponents} $p,q\geq 1$
(meaning $1/p+1/q = 1$ with convention $1/\infty = 0$)
that $f_p^* = f_q$.
(Note, we're \emph{proving} H\"older's inequality, so don't use it as a step of this proof.)
\item
Prove $f_p$ is convex for every $p\in[1,\infty]$.
\item
Under the notation (and making use of) the previous part,
show for any $u,v\in\R^d$ with $\|u\|_p = 1 = \|v\|_q$
that $|\ip{u}{v}| \leq 1$.
\item
Now use the previous part to prove the full version of H\"older's inequality (which
implies the Cauchy-Schwarz inequality):
given any $u,v\in \R^d$
and any conjugate exponents $p,q\geq 1$,
\[
|\ip{u}{v}| \leq \|u\|_p \|v\|_q.
\]
\item
\textbf{Optional:}
prove $(\|\cdot\|_p)_* = \|\cdot\|_q$, where $p,q\geq 1$ are conjugate exponents.
\item
Define $f(x) := x^\top Q x/2$, where $Q\in \R^{d\times d}$
a symmetric positive definite matrix.
Derive and rigorously prove an explicit form for $f^*$.
(\textbf{Hint:} try to guess the answer first.)
\item
Define $f(x) := x^\top Q x/2$ once again, but now $Q\in \R^{d\times d}$
is merely symmetric positive semi-definite.
Now derive and rigorously prove a new expression for $f^*$.
(\textbf{Hint}: there are many ways
here, but again try to guess
the answer, and in times of great need never forget your friend S-V-D.)
\end{enumerate}
\bigskip
\textbf{Solution.}
\clearpage
\item
\textbf{(Max of random variables; moment generating functions.)}
An important object in the study of random variables is the moment generating function (MGF), $M_X(t)$,
defined as $M_X(t) := \bbE(\exp(tX))$.
($M_X$ will in general fail to be finite for all $t \geq 0$,
but in this question it is finite for all $t\geq 0$.)
Given a family $(X_1,\ldots,X_d)$ of i.i.d. random variables drawn according to some distribution,
this question will investigate the behavior of the random variable
$Z := \|(X_1,\ldots,X_d)\|_\infty = \max_i |X_i|$.
\begin{enumerate}
\item
Prove the following inequality, which will be convenient in the remainder of the question:
for any $t > 0$,
\[
\bbE(Z) \leq
\frac 1 t
\ln\del{
d \bbE\del{\exp(tX_1) + \exp(-tX_1)}
}.
\]
\textbf{Note / hint:} consider waiting for the ``convexity bootcamp'' lectures.
\textbf{Hint:} start your proof from the rearranged left-hand side $\exp(t\bbE(Z))$.
\item
\textbf{Optional:}
Suppose $X_1$ is distributed according to a Gumbel distribution with scale parameter $\sigma$,
whereby $\bbE(\exp(sX_1)) = \Gamma(1-s\sigma)$ for all $s\in \R$,
where $\Gamma$ denotes the gamma function.
Prove that
\[
\bbE(Z) \leq 2 \sigma \ln( 2d \sqrt{\pi}).
\]
(See below\footnote{The inequality from the first part holds for all
$t$\ldots can you find a particularly nice choice of $t$?} for a hint.)
\item
Prove that the Gaussian distribution is \emph{subgaussian}:
in particular, if $X_1$ is Gaussian with mean 0 and variance $\sigma^2$,
then $\bbE(\exp(tX_1)) = \exp(t^2\sigma^2/2)$.
\item
Prove that if $X_1$ is subgaussian with parameter $\sigma$
(meaning $\bbE(\exp(tX_1)) \leq \exp(t^2 \sigma^2 /2)$), then
\[
\bbE(Z) \leq \sigma \sqrt{2\ln (2d) }.
\]
(Together with the preceding part, this implies the bound for $X_1$ a Gaussian with mean 0 and variance $\sigma^2$.)
\item
Was it necessary to assume $(X_1,\ldots,X_d)$ were i.i.d.?
Answer this question however you like.
\end{enumerate}
\bigskip
\textbf{Solution.}
\end{enumerate}
\clearpage
\bibliographystyle{plainnat}
\bibliography{hw1}
\end{document}