#R code for econometrics

  
# Candidate locations of population.csv (a semicolon-separated file).
# Add the path used on your own machine to this vector. Only paths that exist
# are considered, so the script no longer stops on a path that belongs to
# another computer. If several exist, the last one is read.
population_paths <- c(
  "C:/Users/Hugo-Harari-Kermadec/Dropbox/enseignement/M2 EPOG econometrics/1bis basics/population.csv",
  "C:/Users/harari/Desktop/Dropbox/enseignement/M2 EPOG econometrics/1bis basics/population.csv"
)
existing_paths <- population_paths[file.exists(population_paths)]
if (length(existing_paths) == 0L) {
  stop(
    "population.csv not found; add its location to 'population_paths'.",
    call. = FALSE
  )
}
population <- read.csv(existing_paths[length(existing_paths)], sep = ";")

library(tidyverse)

# "population" is a data.frame
#Gender = 1 if female
#real model : Income = 1100-150*Gender+40*Exp-1*(Exp-10)^2+error
#in excel ARRONDI(1100-150*Gender+40*Exp-1*(Exp-10)^2+LOI.NORMALE.INVERSE.N(ALEA();0;100);0)
# Quick look at the data: first rows, then the data viewer
head(population)
view(population)

# Description of the population
N <- nrow(population) # number of rows (individuals)
N

# Base-graphics histogram of income; the returned object is kept in histI
histI <- hist(population$Income)
mu <- mean(population$Income) # population mean of income
mu
var(population$Income)

# First and third quartiles of income
q <- quantile(population$Income, c(.25, .75))

# Histogram of income with the two quartiles marked on the x axis.
# The marks are added as a ggplot layer: the base-graphics points() function
# cannot draw on a ggplot figure.
ggplot(population) +
  geom_histogram(aes(Income), color = "black", fill = "white", bins = 20) +
  geom_point(
    data = data.frame(x = unname(q), y = 0),
    aes(x, y), color = "blue", shape = "x", size = 8
  )



# Gender: histogram object (computed but not drawn), then mean and variance
histG <- hist(population$Gender, plot = FALSE)
(muG <- mean(population$Gender))
var(population$Gender)

# Sampling one individual: draw a single row number at random and show that row.
# sample.int(N, 1) draws from 1..N and stops with an error when the population
# is empty, whereas 1:N would silently turn into c(1, 0).
index <- sample.int(N, size = 1)
index
population[index, ]


# sample_n of size n: n row numbers drawn with replacement from 1..N
n <- 2000
indexes <- sample.int(N, n, replace = TRUE)
indexes
sample_n <- population[indexes, ]
# head(sample_n)
mu_hat <- mean(sample_n$Income) # sample mean of income
mu_hat

# Description of the sample: density histogram of the population, with the
# histogram of the sample overlaid in translucent blue
hist(population$Income, freq = FALSE)
hist_sample <- hist(sample_n$Income, plot = FALSE)
plot(hist_sample, col = rgb(0, 0, 1, 1 / 4), freq = FALSE, add = TRUE)

mu_hat <- mean(sample_n$Income)
mu_hat

# New, smaller sample_n of size n = 50, drawn the same way; sample_n and n
# are reused by the code that follows
n <- 50
indexes <- sample.int(N, n, replace = TRUE)
indexes
sample_n <- population[indexes, ]

# Histogram object of the new sample (computed but not drawn)
histR_ech <- hist(sample_n$Income, plot = FALSE)

# Confidence interval for the population mean
mu_hat <- mean(sample_n$Income) # sample mean: estimator of the population mean
mu_hat
var_hat <- var(sample_n$Income) # sample variance: estimator of the variance

# Population histogram with the true mean (red) and the estimated mean (blue)
plot(histI, col = rgb(1, 0, 0, 1 / 4), freq = FALSE)
points(x = mu, y = 0, col = "red", pch = "x", cex = 2)
points(x = mu_hat, y = 0, col = "blue", pch = "x", cex = 2)

# 95% bounds: mu_hat -/+ sqrt(var_hat / n) * Student quantile with n - 1 df
(l_b <- mu_hat - sqrt(var_hat / n) * qt(.975, df = n - 1)) # lower bound
(u_b <- mu_hat + sqrt(var_hat / n) * qt(.975, df = n - 1)) # upper bound
# the quantile comes directly from qt(.975, df = n - 1)
points(x = c(l_b, u_b), y = c(0, 0), col = "blue", pch = "I", cex = 2)

# Test
t.test(x = sample_n$Income, mu = 1700, conf.level = 0.95)
# the test is equivalent to checking whether the tested value lies in the
# confidence interval


# Testing that the mean income in a sample of men
# is equal to the mean income of women

# Logical vector, TRUE for the rows where Gender equals 1
population$Gender == 1

# Rows with Gender == 0 and their mean income
males <- population[population$Gender == 0, ]
muM <- mean(males$Income)
muM

# Rows with Gender == 1 and their mean income
females <- population[population$Gender == 1, ]
muF <- mean(females$Income)
muF

head(females)

# Sample of n men drawn with replacement. sample.int() stops with an error if
# there are no rows to draw from, whereas 1:nrow(males) would silently become
# c(1, 0) for an empty subset.
n <- 100
indexes <- sample.int(nrow(males), n, replace = TRUE)
sample_n <- males[indexes, ]

# One-sample t-test of the men's sample mean against muF
t.test(sample_n$Income, mu = muF)

# Expression of the t statistic: sqrt(n) * (sample mean - muF) / sample sd
sqrt(nrow(sample_n)) * (mean(sample_n$Income) - muF) / sd(sample_n$Income)
     

# Linear model: scatter plot of income against experience in the current sample
plot(x = sample_n$Experience, y = sample_n$Income)