# Import libraries 
import os
import pandas as pd
import numpy as np
#import csv
import pathlib
#from collections import Counter
#import matplotlib.pyplot as plt
#import pyodbc
from datetime import datetime
import timeit
import re
from string import punctuation
import unidecode
import ast

# Silence warnings that would otherwise clutter the notebook output
import warnings
# Every FutureWarning is ignored from here on
warnings.simplefilter("ignore", FutureWarning)
# Disable pandas' chained-assignment check (SettingWithCopyWarning)
pd.set_option("mode.chained_assignment", None)

# Progress bar tool
from tqdm import tqdm, tqdm_notebook

# Custom display limits: raise the row and column display limits to 500
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
# pd.set_option('display.max_colwidth', -1)

# Multi-output display: set IPython's ast_node_interactivity to "all" so that
# a cell shows the result of every expression it contains
# (presumably instead of only the last one — confirm against the IPython docs).
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# System information: print the operating system, the Python version and the
# versions of the main packages so the notebook environment can be reproduced.
import os
import platform
from platform import python_version
import jupyterlab
import numpy as np
import pandas as pd

# os.name is the name of the OS-dependent module in use (e.g. 'posix', 'nt').
# NOTE(review): the original line was truncated after the '%'; os.name is the
# reconstructed argument — confirm this is the value that was intended.
print("os name: %s" % os.name)
print("system: %s" % platform.system())
print("release: %s" % platform.release())
print("version: %s" % python_version())
print("Python Packages")
# Lines below use the pip "name==version" form; the package name was
# previously misspelled as "jupterlab".
print("jupyterlab==%s" % jupyterlab.__version__)
print("pandas==%s" % pd.__version__)
print("numpy==%s" % np.__version__)
# SQL connection
# pyodbc is imported here because its import at the top of the file is
# commented out; without it the call below fails with NameError.
import pyodbc

# NOTE(review): the DSN, user and password are hard-coded placeholders.
# Load real credentials from a secure source instead of committing them.
conn = pyodbc.connect('DSN=my_DB;UID=my_user;PWD=my_password')

# Extract data
This code contains two functions: Overview plot and Overview table. I published the code on GitHub and on Kaggle. Feel free to use it, upvote it if you like it, and share yours! 🙂

Overview plot: the first function displays the correlation of each dataset variable with your target variable. In my example, the target is SalePrice (the sale price of houses), from a Kaggle dataset.

Overview table: the second function displays the different values present in each of the columns. For example, in the third row of the first column, RL [1151 = 78,84%], RL is the value, 1151 is its number of occurrences, and 78,84% is its percentage.

How to systematically remove collinear variables in Python?

Source :

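This computes the VIF (variance inflation factor) of every variable, then repeatedly drops the variable with the highest VIF until no remaining variable has a VIF greater than 5.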

from statsmodels.stats.outliers_influence import variance_inflation_factor    

def calculate_vif_(X, thresh=5.0):
    """Drop the highest-VIF column repeatedly until no VIF exceeds ``thresh``.

    On each pass the variance inflation factor of every remaining column is
    computed with ``variance_inflation_factor``. If the largest value is
    strictly greater than ``thresh``, that column is removed and the VIFs are
    recomputed on the reduced set; otherwise the loop stops.

    NOTE(review): the VIFs are computed on ``X`` exactly as given; no
    intercept column is added here — confirm whether callers are expected to
    include one.

    Args:
        X: pandas DataFrame of candidate variables (accessed through
            ``.iloc``, ``.shape``, ``.columns`` and ``.values``).
        thresh: Largest VIF that is tolerated. Defaults to 5.0.

    Returns:
        ``X`` restricted to the columns that were kept, in their original
        order.
    """
    kept = list(range(X.shape[1]))

    # Stop at one column (or none): there is nothing left to be collinear
    # with, and max() of an empty VIF list would raise ValueError.
    while len(kept) > 1:
        # Slice once per pass instead of re-slicing for every VIF.
        subset = X.iloc[:, kept]
        values = subset.values
        vif = [variance_inflation_factor(values, ix)
               for ix in range(values.shape[1])]

        worst = max(vif)
        if worst > thresh:
            maxloc = vif.index(worst)
            # str.format copes with non-string column labels, where '+'
            # concatenation raised TypeError.
            print("dropping '{}' at index: {}".format(
                subset.columns[maxloc], maxloc))
            del kept[maxloc]
        else:
            break

    print('Remaining variables:')
    # The header used to be printed with nothing after it.
    print(X.columns[kept].tolist())
    return X.iloc[:, kept]

Regression: linear regression, Support Vector Regression (SVR), and regression trees

Classification: logistic regression, Naïve Bayes, decision trees, K-Nearest Neighbors, and kernel approximation

The 10 Best Machine Learning Algorithms for Data Science Beginners

Jupyter Notebook offers possibilities that are as wide as they are unexpected. Here is a compilation of links to articles that deal with this subject:

Executing shell commands, splitting notebook cells, collapsing headings, Qgrid (a dynamic, Excel-like table), slide shows (fixed or interactive), embedding content (URL, PDF, YouTube video, etc.), and interactive widgets:


Environment switching, plus a list of 9 useful extensions, but without examples:


Profile report of a dataframe, interactive plot with plotly, and useful magic commands:


Variable inspector, execution time, hiding code input:


Notify, code folding, debug:


Much more to explore here:


Qgrid demo and more:


Unofficial Jupyter Notebook Extensions page:


