__author__ = "Narendra Boyina"
""" pandas Concepts
1. Introduction of Pandas
2. Creating Series
a. Creating a Series with integer values
b. Creating a Series with float values
c. Creating a Series of different data type values
d. Creating a Series by providing different labels for the values
3. Built-In_Functions
4. Aggregation Functions
5. DataFrame
a. Creating a DataFrame using a nested list (list of lists)
i. Accessing Single Column data
ii. Accessing Multiple Column data
b. Creating a DataFrame using a Dictionary
c. Create a Custom Row Index to the dataframe
6. Accessing required data from Series & DataFrame
a. Accessing data from series using Index (iloc)
b. Accessing range of data from series using Index (iloc)
c. Accessing data from series using label (loc)
d. Accessing range of data from series using label (loc)
c. Accessing data from DataFrame using Index (iloc)
d. Accessing range of data from DataFrame using Index (iloc)
e. Accessing data from DataFrame using label (loc)
f. Accessing range of data from DataFrame using label (loc)
6. Handling Missing Data
Apply below methods for series & DataFrame
isnull(), notnull(), fillna(value), dropna() --> for series & DataFrame
7. Indexing, Slicing, and Filtering
8. Merging dataframe
9. Importing DataSet
"""
"""
################# Introduction of pandas ###################
--> Pandas is a powerful Python library for data manipulation and analysis.
--> Pandas provides easy-to-use data structures and functions to work with structured data like tabular, time series, or matrix data
--> Pandas has Inbuilt-functions for data analyzing, cleaning, and manipulating.
Pandas primarily provides two data structures:
1) Series data structure
2) DataFrame data structure
"""
"""
1. Series
A Series is a one-dimensional labeled array that can hold any datatype (integers, floats, strings, Python objects, etc.)
"""
################# Importing pandas & NumPy modules ###################
import pandas as pd
import numpy as np
# print(pd.__version__) # to check pandas library version
# Creating a Series with integer values
series_of_int_values = pd.Series([1, 8, 3, 2, 5, 4, 6])
# print(series_of_int_values) # Each value in the Series is assigned an index (By default 0, 1, 2, ...) + dtype: int64
"""
output:
0 1
1 8
2 3
3 2
4 5
5 4
6 6
dtype: int64
"""
# Creating a Series with float values
series_of_float_values = pd.Series([4.72, 22.5, 73.2, 74.5])
# print(series_of_float_values) # Each value in the Series is assigned an index (By default 0, 1, 2, ...) + dtype: float64
"""
0 4.72
1 22.50
2 73.20
3 74.50
dtype: float64
"""
# Creating a Series of different data type values
series_of_diff_val = pd.Series(["hello", 472, 22.5, "good morning", 732, "hello", 74.5])
# print(series_of_diff_val) # Each value in the Series is assigned an index (By default 0, 1, 2, ...) + dtype: object
"""
0 hello
1 472
2 22.5
3 good morning
4 732
5 hello
6 74.5
dtype: object
"""
"""
We can also gives label the index:
Creating a Series by providing different labels for the values
"""
data = pd.Series([4.72, 22.5, 73.2, 74.5], index=['a', 'b', 'c','d'])
# print(data) # Each value in the Series is assigned a label + dtype: float64
data = pd.Series([4.72, 22.5, 73.2, 74.5], index=['I', 'II', 'III','IV']) # roman numbers
# print(data) # Each value in the Series is assigned a label + dtype: float64
################# Built-In_Functions of pandas ###################
""" describe(): Provides a quick summary of the data.
This method gives a statistical summary of the Series, including count,
mean, standard deviation, minimum, maximum, and quartile values."""
statistic_data = pd.Series([10, 3.24, 5, 7, 9.2, 17 ]) # Creating a Series
# Descriptive statistics
# print(statistic_data.describe())
"""mean(): Computes the mean of the data.
add all the numbers together and then divide the sum by the total number of values in the set. """
# Mean of the Series
# print(statistic_data.mean())
"""std(): Computes the standard deviation."""
# Standard deviation of the Series
# print(statistic_data.std())
"""min() and max(): Computes the minimum and maximum values."""
# Minimum and maximum values
# print(statistic_data.min())
# print(statistic_data.max())
################# Aggregation Functions ###################
"""sum(): Sums up the values"""
# Sum of the Series
# print(statistic_data.sum())
"""cumsum(): Cumulative sum is the running total of frequencies in a dataset, calculated by adding each frequency
to the sum of all previous frequencies."""
# Cumulative sum of the Series
statistic_data = pd.Series([10, 3.24, 5, 7, 9.2]) # Creating a Series
# print(statistic_data)
# print(statistic_data.cumsum())
"""aggregate(func): Aggregates using one or more operations. Example : 'sum', 'mean', 'std' """
# Aggregating using multiple operations
aggregated = statistic_data.aggregate(['sum', 'mean', 'std'])
# print(aggregated)
""" ################ DataFrame ################
A DataFrame is a two-dimensional labeled data structure (like a table or a spreadsheet) with rows and columns """
# Creating a DataFrame using a nested list (list of lists)
rows = [
["Narendra", 34, "Rajahmandry"],
["Manjula", 25, "Hyderabad"],
["Srikanth", 27, "Vijaywada"]
] # Note: rows data acts as values, & coulmns data acts as keys
dataframe = pd.DataFrame(rows, columns=["Name", "Age", "City"])
# print(dataframe, "\n") # it will convert to Tabular_data & bydefault it will provide index Ex: 0 1 2 ....etc
# Accessing Column data from DataFrame
# print(dataframe["Name"], "\n") # Prints a single column data based on
# print(dataframe[["Name", "Age"]]) # Prints multiple columns data
# print(dataframe[["Name", "City"]]) # Prints multiple columns data
"""Creating a DataFrame using a Dictionary"""
dict_data = {"Name":["Sri", "Meera", "Divya"],"Emp_ID":[2160, 2175,2183], "Specialization":["B.com", "B.C.A", "M.Sc"]}
dataframe = pd.DataFrame(dict_data)
# print(dataframe, "\n") # it will convert to Tabular_data & bydefault it will provide index Ex: 0 1 2 ....etc
""" Create a Custom Row Index to the dataframe"""
df_cust_index = pd.DataFrame(dict_data) # by default index will be printed as 0,1,2,3,.. for the data\frame
df_cust_index = pd.DataFrame(dict_data, index=['I', 'II', 'III'])
df_cust_index = pd.DataFrame(dict_data, index=['a', 'b', 'c'])
df_cust_index = pd.DataFrame(dict_data, index=['nanditha', 'Raahi', 'ganesh'])
# print(df_cust_index)
# Creating a DataFrame from a Dictionary
# Define data
Original_data = {
'Name': ['Narendra', 'Srikanth', 'Meera', 'Vinod', 'karthikesh', "Raahi", "Nanditha", "Venkat"],
'Email': ['narendra@brtechnosolutions.com', 'srikanth@brtechnosolutions.com', 'Meera@brtechnosolutions.com',
'vinod@brtechnosolutions.com', 'karthikesh@brtechnosolutions.com', "Raahi@brtechnosolutions.com",
"Nanditha@brtechnosolutions.com", "Venkat@brtechnosolutions.com"],
'Role': ['Founder', 'Growth Manager', 'Instructor','Course Designer', 'Placement Coordinator', "HR Executive","Office Administrator", "Digital Marketing Executive"],
'Phone Number': ['1111111111','2222222222', '3333333333', '4444444444', '5555555555'
, '6666666666', '7777777777', '8888888888']
}
# Display full DataFrame without column truncation
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None) # Allows full-width content to show
pd.set_option('display.expand_frame_repr', False) # Prevent wrapping
# Create DataFrame
stud_info_df = pd.DataFrame(Original_data, index =[101, 102, 103, 104, 105, 106, 107, 108])
# print(stud_info_df,"\n") # Display the complete DataFrame with all rows & coumns
#
# print(stud_info_df.head() ,"\n") # Display first 5 rows
# print(stud_info_df.tail(),"\n") # Display last 5 rows
""" ############## Accessing required data from Series & DataFrame ###################"""
"""Accessing data from series using Index (iloc[ ])"""
""""we can use iloc[ ] in 2 ways
1. index-based accessing
2. Label-based accessing """
# Indexing by position
accessing_of_series = pd.Series([24, 2, np.nan, 4, np.nan, None, 18],["a","b","c","d","e","f","g"])
# print(accessing_of_series,"\n")
# print(accessing_of_series.iloc[0]) # First element (Accessing data from series using Index)
# print(accessing_of_series.iloc[-1]) # Last element (Accessing data from series using Index)
# print(accessing_of_series.iloc[1:4]) # Accessing range of data from series using Index (iloc)
"""loc[ ]: Label-based accessing"""
# print(accessing_of_series.loc["a"]) # First element (Accessing data from series using Label)
# print(accessing_of_series.loc["g"]) # Last element (Accessing data from series using Label)
# print(accessing_of_series.loc['b':'d']) # Accessing range of data from series using labels
"""Accessing data from DataFrame using Index"""
input_df = pd.DataFrame({
'Name': ['Narendra', 'Meera', "Srikanth"],
'Age': [33, 20, 26],
'Qualification': ['M.Tech', 'B.C.A', "B.Com"],
'City': ['India', 'USA', 'India']
}) # by default it will provide the index 0,1, 2.....etc
# print(input_df,"\n")
# print(input_df.iloc[0]) # Accessing data from DataFrame using Index (iloc)
# print(input_df.iloc[0:2]) # Accessing range of data from DataFrame using Index (iloc)
"""Accessing data from DataFrame using label"""
input_df = pd.DataFrame({
'Name': ['Narendra', 'Meera', "Srikanth"],
'Age': [33, 20, 26],
'Qualification': ['M.Tech', 'B.C.A', "B.Com"],
'City': ['India', 'USA', 'India']
}, index =["A", "B", "C"])
# # print(input_df,"\n")
# print(input_df.loc["B"]) # Accessing data from DataFrame using label (loc)
# print(input_df.loc["A":"B"]) # Accessing range of data from DataFrame using label (loc)
"""s[s > n]: Filters and returns elements greater than n."""
# Create a Series
series_data = pd.Series([10, 25, 17, 40, 5, 33, 567, 229,33,55, 2, 4, 3])
# Filter elements greater than 20
num = 20
# filtered = series_data[series_data > num]
# print(filtered)
#
"""################ Merging DataFrame ################"""
"""In Pandas,the merge() function is used to combine two DataFrames based on a common column or index."""
# First DataFrame
df1 = pd.DataFrame({
'ID': [1, 2, 3, 5],
'Name': ['Sindhu', 'Anjela', 'Sangeeta', "raahi"]
})
# Second DataFrame
df2 = pd.DataFrame({
'ID': [1, 2, 4, 3],
'Score': [85, 90, 95,77]
})
# Merge based on ID
merged_df = pd.merge(df1, df2, on='ID', how='inner')
# on: Column name(s) to join on (must be in both)
# 'inner': Only matching rows (default)
# print(merged_df)
"""
################# Data Manipulation ###################
lambda(), map(), apply(), sort(values), drop(labels)
Lambda():
1)Lambda function is called a single-line-anonymous function.
2)We can use the lambda keyword to create small anonymous function.
3)Lambda forms can take any number of arguments but return just one value in the form of an expression.
They cannot contain commands or multiple expressions.
x = ((a+b)*c/d)%e
Function definition is here
variable = lambda arguments: operation
Syntax: variable = lambda arg1 [arg2,arg3,.....argn]:expression
"""
fac = lambda n: 1 if n==0 else n*fac(n-1)
# print(fac(5))
"""
apply():
apply() Similar to map, but more flexible. (It Can be used Data Frames as well, where as map is only for Series)
"""
data = pd.Series([64, 9, 25, 36])
# Applying a lambda function to series, to calculate square root (1/2 == 0.5) of all the numbers present in the series
func = lambda x: x ** 0.5
sqrt = data.apply(func) #square root always gives float values
# print(sqrt)
# Create a DataFrame with multiple Series
data = {
'A': [1, 2, 3],
'B': [4, 5, 6],
'C': [7, 8, 9]
}
num_data = pd.DataFrame(data)
# print(num_data)
# Define a function to sum two Series
def sum_series(x, y):
return x + y
# Apply the function on multiple Series using apply()
result = num_data.apply(lambda row: sum_series(row['A'], row['B']), axis=1)
# print(result) # Print the result
"""
sort_values(): Sorts the Series.
"""
element_sort = pd.Series([17, 4, 3, 27, 9, 6])
# Sorting the Series
sorted_s = element_sort.sort_values()
# print(sorted_s) # index will be constant even after sorted
""" df.drop()
The concept of "drop" usually refers to removing rows or columns from a DataFrame using the .drop() method.
Syntax: df.drop(labels, axis, inplace=False)
Syntax Parameters:
labels: Name(s) of the row(s) or column(s) to drop.
axis:
0 or 'index' → drop rows
1 or 'columns' → drop columns
inplace:
False (default) → returns a new DataFrame
True → modifies the original DataFrame
inplace=True parameter is used to modify the original DataFrame directly without creating a new one. """
""" Drop a colum by index """
input_df = pd.DataFrame({
'Name': ['Narendra', 'Meera', "Srikanth"],
'Age': [33, 20, 26],
'Qualification': ['M.Tech', 'B.C.A', "B.Com"],
'City': ['India', 'USA', 'India']
})
# print("New Data frame: \n",input_df)
# dropped_df = input_df.drop('City', axis=1) # Drops 'City' column
# print("Data frame after deleting column by label: \n",dropped_df)
""" Drop a row by index """
# dropped_df = input_df.drop(2, axis=0) # Drops the row with index 0
# print("Data frame after deleting column by index: \n", dropped_df)
input_df = pd.DataFrame({
'Name': ['Narendra', 'Meera', "Srikanth"],
'Age': [33, 20, 26],
'Qualification': ['M.Tech', 'B.C.A', "B.Com"],
'City': ['India', 'USA', 'India']
})
# print(input_df)
""" Drop multiple columns from Data Frame """
dropped_df = input_df.drop(['Age', 'City'], axis=1)
# print(dropped_df)
""" ############## Handling Missing Data ###################
Applying methods ( isnull(), notnull(), fillna(value), dropna()) for series & DataFrame """
"""isnull(): Checks for missing values, return boolean values, in both series & DataFrame """
output_series = pd.Series([1, 3, None, 7, 9])
# print(output_series.isnull()) # Checking for missing values from series
"""
output:
0 False
1 False
2 True
3 False
4 False
dtype: bool
"""
input_df = pd.DataFrame({
'Name': ['Narendra', 'Meera', None],
'Age': [33, None, 26],
'Qualification': ['M.Tech', 'B.C.A', "B.Com"],
'City': ['India', None, 'India']
})
output_df = input_df.isnull() # Checking for missing values from DataFrame
# print(output_df)
""" notnull(): Opposite of isnull() -->Checking for non-null values in Series & DataFrame """
# print(output_series.notnull()) # Checking for non-null values in Series
# print(input_df.notnull()) # Checking for non-null values in DataFrame
"""
output:
0 True
1 True
2 False
3 True
4 True
dtype: bool
"""
""" fillna(value): Fills missing values with a specified value in both series & DataFrame """
""" Fills missing values with a specified value in series"""
# Create a Series with missing values
input_series = pd.Series([1, 2, np.nan, 4, np.nan, None]) # nan means --> (NOT A NUMBER)
# print(input_series) # Print the Series
# Filling missing values with 9
filled_series = input_series.fillna(19) # none will be fill with required number Ex: 9
# print(filled_series)
""" Fills missing values with a specified value in DataFrame"""
input_df = pd.DataFrame({
'Name': ['Narendra', 'Meera', None],
'Age': [33, None, 26],
'Qualification': ['M.Tech', 'B.C.A', "B.Com"],
'City': ['India', None, 'India']
})
output_df = input_df.fillna(4) # removing 9 from fata frame in akl the places with requiored
# print(output_df)
""" dropna(): Drops all rows that contain missing values."""
# Creating a Series with missing values
# input_series = pd.Series([1, 2, np.nan, 4, np.nan, None,18]) # nan means --> (NOT A NUMBER)
# print(input_series) # Print the input Series just to see the difference before and after
# dropped_missing = input_series.dropna() # Dropping missing values temporarily removed from series
# print(dropped_missing) # Print the output Series after dropping missing values
input_df = pd.DataFrame({
'Name': ['Narendra', 'Meera', None],
'Age': [33, None, 26],
'Qualification': ['M.Tech', 'B.C.A', "B.Com"],
'City': ['India', None, 'India']
})
# print(input_df,"\n") # Print the input DataFrame just to see the difference before and after
dropped_missing = input_df.dropna() # Dropping missing values temporarily removed from DataFrame
# print(dropped_missing) # Print the output DataFrame after dropping missing values
===========================================================================
Pandas_part_2
===========================================================================__author__ = "Narendra Boyina"
""" Real-Time usage """
##### Creating Mini Project to import dataset #####
import pandas as pd
""" ###### Importing dataset ######
--> A simple way to store big data sets is to use CSV files (comma separated files).
--> CSV files contains plain text and is a well know format that can be read by everyone including Pandas.
"""
"""" Properties of dataframe
head(), tail(), shape """
""" head(n):
The df.head(n) method is used to view the first n rows of the DataFrame.
If you don't specify "n", the default number of rows displayed is 5.
tail(n):
The df.tail(n) method is similar to df.head(n) but for the end of the DataFrame.
It returns the last "n" rows.
shape :
The df.shape attribute of a DataFrame returns a tuple representing the dimensionality of the DataFrame.
The first element of the tuple is the number of rows, and the second is the number of columns.
"""
''' To get dataset from different sources like "Kaggle","Google Dataset" Search etc.'''
create_csv_file = pd.read_csv("clean_jobs.csv")
pd.set_option('display.max_rows', 500) # or None for all rows
pd.set_option('display.max_columns', 50)
# print(create_csv_file) # we can consider "create_csv_file" as an Object or Dataframe
# print(create_csv_file.head()) # By default 5 rows will be displayed from the top of the file
# print(create_csv_file.head(7)) # 7 rows will be displayed from the top of the file
# print(create_csv_file.tail()) # By default 5 rows will be displayed from the bottom of the file
# print(create_csv_file.tail(8)) # 8 rows will be displayed from the bottom of the file
# print(create_csv_file.shape) # will display no of rows & no of columns
# print(create_csv_file.columns) # Lists all the column names in the DataFrame
""" Inspecting Data Types:
Each column in a DataFrame has a specific data type.
Understanding these types is crucial for proper data manipulation."""
# print(create_csv_file.dtypes) # Display the data types of each column
"""
obj.loc: The obj.loc method is used for label-based indexing, meaning we can access
rows and columns using their labels (i.e., index names and column names)."""
# Selecting specified no of rows and a specific column by label
titles = create_csv_file.loc[2:7, 'location']
# print(titles)
# Selecting all rows and a specific column by label
titles = create_csv_file.loc[ : , 'title']
# print(titles)
# Selecting all rows and a specific column by label
titles = create_csv_file.loc[ : , 'company']
# print(titles)
# Selecting a range of rows and multiple columns by labels
subset = create_csv_file.loc[1:5, ['title', 'company','location']]
# print(subset)
"""
df.iloc: While obj.loc uses labels for indexing, df.iloc allows for integer-based indexing.
You use df.iloc to access rows and columns by their integer positions,
which makes it useful when you need to access data by its position in the DataFrame.
"""
# Selecting a single row from the DataFrame
# single_row = create_csv_file.iloc[0]
# print(single_row) # oth index (first row) data will be printed
# Selecting a specific row and columns by integer indices
# specific_data = create_csv_file.iloc[10, [1, 2, 3]] # row at index 10 and columns at indices 1, 2, and 3
# print(specific_data)
# Slicing to get multiple rows and columns
multi_rows_data_acess = create_csv_file.iloc[20:26, 0:4] # Rows 20 to 25 and columns 0 to 3 (26 and 4 is excluded)
# print(multi_rows_data_acess)
"""
df.at: obj.at is designed to access a single value for a row/column label pair.
It is very similar to df.loc for accessing scalar values but is optimized for
faster access when you only need to get or set a single value in a DataFrame."""
# Access a specific single value using "row index" and "column name"
title_of_first_data = create_csv_file.at[0, 'title']
# print(title_of_first_data)
# Access a specific single value using "row index" and "column name"
title_based_on_index = create_csv_file.at[14, 'title']
# print(title_based_on_index)
# Accessing data from a specific country
accessing_us_data = create_csv_file[create_csv_file['location'] == 'United States']
# print(accessing_us_data)
# Accessing data from a specific company
accessing_us_data = create_csv_file[create_csv_file['company'] == 'Meta']
#print(accessing_us_data)
"""
##### Updating Rows and Columns #######
df.drop:
The .drop() method in pandas is used to remove rows or columns from a DataFrame.
Its primary purpose is to drop specified labels from rows or columns.
Parameters:
labels: The row or column labels to drop.
axis: Specifies whether the labels refer to rows (axis=0) or columns (axis=1). By default, it's 0 (rows).
index or columns: An alternative way to specify the labels to drop, instead of using the labels parameter.
It is equivalent to specifying axis=0 (for index) or axis=1 (for columns).
inplace: If True, the operation is done in place, meaning it modifies the DataFrame directly and returns None.
If False or not specified, it returns a new DataFrame with the specified labels dropped. """
# print(create_csv_file.columns) # Lists all the column names in the DataFrame
# New_csv = create_csv_file.drop(labels='title',axis=1,inplace=False)
# print(New_csv.columns) # Lists all the column names in the DataFrame
"""Direct Assignment:
Directly assign a value to a specific column or even a cell in a DataFrame."""
create_csv_file.at[0, 'location'] = 'Hyderabad' # Changes the location of the first data to hyderabad
#print(create_csv_file.head(5))
create_csv_file['new_column'] = 'default value' # Adds a new column with all entries set to 'default value'
#print(create_csv_file)
create_csv_file.drop(axis=1,labels='new_column',inplace=True)
#print(create_csv_file)
""" Updating Using map or replace:
You can update a column based on a mapping dictionary or replace values."""
create_csv_file['title'].map({'Data Analyst': 'Data Analysts', 'Senior Data Analyst': 'Senior Data Analysts'}) # Mapping existing values to new ones
#print(create_csv_file)
#create_csv_file['location'].replace('San Francisco, CA', 'United States', inplace=True) # Replacing specific values
#print(create_csv_file.head(10)) # chtgpt ask
"""Changing the name of Index"""
"""Pandas allows you to rename the index of a DataFrame or Series, which can help in making the index more
informative or aligning it with new data requirements."""
# Renaming the Index of a DataFrame
#create_csv_file.index.names = ['job_id'] # Renames the index to 'movie_id'
#print(create_csv_file)
""" inplace=True """
""" In pandas, the inplace=True parameter is used in methods to modify the original DataFrame or Series directly,without creating a new object.
When inplace=True, the operation is performed on the same object, and no new object is returned.
This can save memory but requires caution as the original data is altered."""
create_csv_file.rename(columns={'job': 'job_title', 'company': 'company_name'}, inplace=True)
#print(create_csv_file)
""" Display Options """
#You can use pd.set_option() to modify how data is displayed.
""" Set maximum number of rows and columns to display """
# pd.set_option('display.max_rows', 7)
# pd.set_option('display.max_columns', 5)
#print(create_csv_file)
""" Reset Options """
pd.reset_option('display')
#print(create_csv_file)
################ Grouping Data ###############
genre_groups = create_csv_file.groupby('location') # Groups the data by the 'genre' column
# print(genre_groups)
# for location, group_data in genre_groups:
# print(f"Genre: {location}")
# print(group_data)
# print()Author: Boyina Narendra
Supporting Author: M. Meera Sindhu
Request: If you find this information useful, please provide your valuable comments