Meta comment: I have tried to make all of my analysis as replicable as possible. If you have trouble reproducing the same results anywhere, or feel that I did something wrong, please let me know @arjunbazinga.
I couldn't find a good source, in a data-file format, for all the data I wanted for this analysis, so I ended up extracting it from a summary report that is created every year (here). The downloaded file is also available in the repo.
from tabula import read_pdf
import pandas as pd
# Pull the tables found on pages 3, 6 and 7 of the report and bind them, in
# that order, to the population, birth-rate and infant-mortality frames.
# NOTE(review): the three-way unpacking only works if read_pdf yields exactly
# three tables for these pages — confirm if the source PDF changes.
population, birth_rate, infant_mortality = read_pdf(
    input_path="Demographic Indicators.pdf",
    pages=[3, 6, 7],
    multiple_tables=True,
)
# Preview the raw infant-mortality table as extracted from the PDF.
infant_mortality.head(10)

# Work on a copy of the raw table and discard its first four rows
# (presumably header/title rows — check against the preview above).
infant_mortality_clean = infant_mortality.copy()
infant_mortality_clean = infant_mortality_clean.iloc[4:]
print(infant_mortality_clean.head())

# Keep only columns 1, 2 and 3.
infant_mortality_clean = infant_mortality_clean.loc[:, [1, 2, 3]]
print(infant_mortality_clean.head())

# Renumber the rows from zero so they line up with the helper frame that is
# concatenated column-wise below.
infant_mortality_clean = infant_mortality_clean.reset_index(drop=True)
infant_mortality_clean.head()

# Column 2 carries two whitespace-separated values per row; split them into
# separate "total" and "rural" columns and attach them to the frame.
temp = pd.DataFrame(
    infant_mortality_clean[2].str.split().tolist(),
    columns=["total", "rural"],
)
infant_mortality_clean = pd.concat([infant_mortality_clean, temp], axis=1)
infant_mortality_clean.head()

# The combined column is redundant once it has been split.
infant_mortality_clean = infant_mortality_clean.drop(columns=2)
infant_mortality_clean.head()

# Give the two remaining numbered columns descriptive names.
infant_mortality_clean = infant_mortality_clean.rename(
    columns={1: "states", 3: "urban"}
)
infant_mortality_clean.head()
infant_mortality_clean

# Sanity check: True if any cell in the cleaned frame is missing.
infant_mortality_clean.isna().to_numpy().any()

# Persist the cleaned table without the row index.
infant_mortality_clean.to_csv("infant_mortality.csv", index=False)
# Preview the raw birth-rate table as extracted from the PDF.
birth_rate.head(10)

# Work on a copy of the raw table and discard its first five rows
# (presumably header/title rows — check against the preview above).
birth_rate_clean = pd.DataFrame.copy(birth_rate)
birth_rate_clean = birth_rate_clean[5:]
birth_rate_clean.head()

# Column 1 holds each row as one whitespace-separated string. Counting from
# the end of the row, tokens -6, -5 and -4 are read as the total, rural and
# urban figures; the last three tokens are not used. Everything in front of
# those six trailing tokens is the (possibly multi-word) state name.
row_tokens = birth_rate_clean[1].str.split()

states = [" ".join(tokens[:-6]) for tokens in row_tokens]
total = [float(tokens[-6]) for tokens in row_tokens]
rural = [float(tokens[-5]) for tokens in row_tokens]
urban = [float(tokens[-4]) for tokens in row_tokens]

# Build the frame from an explicit mapping rather than eval()-ing variable
# names; the column order matches the original (states, total, rural, urban).
df = pd.DataFrame(
    data={
        "states": states,
        "total": total,
        "rural": rural,
        "urban": urban,
    }
)

# NOTE(review): this writes the row index as an extra first column, whereas
# infant_mortality.csv is written with index=False. Left unchanged so the
# output file keeps its current layout — confirm which form is intended.
df.to_csv("birth_rate.csv")
# Preview the raw population table as extracted from the PDF.
population.head(10)

# Copy the raw table and discard its first four rows in one step
# (presumably header/title rows — check against the preview above).
population_clean = population.copy().iloc[4:]
population_clean.head()
Next step: take the census data from 2011.
# Keep only columns 1, 5 and 6, then renumber the rows from zero so they line
# up with the helper frame that is concatenated column-wise below.
population_clean = population_clean.loc[:, [1, 5, 6]]
population_clean = population_clean.reset_index(drop=True)
population_clean.head()

# Column 6 carries two whitespace-separated values per row; split them into
# separate "urban" and "total" columns and attach them to the frame.
temp = pd.DataFrame(
    population_clean[6].str.split().tolist(),
    columns=["urban", "total"],
)
population_clean = pd.concat([population_clean, temp], axis=1)
population_clean.head()

# The combined column is redundant once it has been split.
population_clean = population_clean.drop(columns=6)
population_clean.head()

# Column 5 becomes "rural"; column 1 is copied into "states" with any
# leading/trailing asterisks removed, and the numbered original is dropped.
population_clean = population_clean.rename(columns={5: "rural"})
population_clean["states"] = [name.strip("*") for name in population_clean[1]]
population_clean = population_clean.drop(columns=1)
population_clean

# Persist the cleaned table (the row index is written as the first column).
population_clean.to_csv("population.csv")