Predict DL Experiments (Python tutorial)
Decide Economic Index
Python Tutorial
Install xlrd
pip3 install xlrd
# note: xlrd 2.x reads only legacy .xls files; newer pandas uses openpyxl for .xlsx
Read Excel
import pandas as pd
xl = pd.ExcelFile("dummydata.xlsx")
xl.sheet_names
>>[u'Sheet1', u'Sheet2', u'Sheet3']
df = xl.parse("Sheet1")
df.head()
# equivalent, calling the method through the class explicitly:
parsed = pd.io.parsers.ExcelFile.parse(xl, "Sheet1")
parsed.columns
import pandas
df = pandas.read_excel(open('your_xls_xlsx_filename','rb'), sheetname='Sheet 1')
# or using the sheet index, starting at 0
df = pandas.read_excel(open('your_xls_xlsx_filename','rb'), sheetname=2)
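Note: pandas 0.21+ renamed the argument to sheet_name; passing sheet_name=None reads every sheet at once. A minimal sketch (the filename is a placeholder):
import pandas as pd
# newer API: sheet_name instead of sheetname
df = pd.read_excel('your_xls_xlsx_filename', sheet_name='Sheet1')
# sheet_name=None returns a dict of DataFrames keyed by sheet name
all_sheets = pd.read_excel('your_xls_xlsx_filename', sheet_name=None)
print(list(all_sheets.keys()))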
import pandas as pd
# open the file
xlsx = pd.ExcelFile("FileName.xlsx")
# get the first sheet as an object
sheet1 = xlsx.parse(0)
# get the first column as a list you can loop through
# (change the 0 below to the column number you want)
column = sheet1.icol(0).real
# get the first row as a list you can loop through
row = sheet1.irow(0).real
# note: icol/irow are deprecated; sheet1.iloc[:, 0] and sheet1.iloc[0] are the modern forms
import pandas as pd
# Read the Excel sheet into a pandas DataFrame
df = pd.read_excel("FileName.xlsx", sheetname=0)
Read Excel with specific rows and columns
import pandas as pd
# define the file name and "sheet name"
fn = 'Book1.xlsx'
sn = 'Sheet1'
data = pd.read_excel(fn, sheetname=sn, index_col=0, skiprows=1, header=0, skip_footer=1)
Transpose a DataFrame
>>> d1 = {'col1': [1, 2], 'col2': [3, 4]}
>>> df1 = pd.DataFrame(data=d1)
>>> df1
col1 col2
0 1 3
1 2 4
>>> df1_transposed = df1.T # or df1.transpose()
>>> df1_transposed
0 1
col1 1 2
col2 3 4
Rename Column
>>> df = pd.DataFrame({'$a':[1,2], '$b': [10,20]})
>>> df.columns = ['a', 'b']
>>> df
a b
0 1 10
1 2 20
df.rename(columns={'pop': 'population',
                   'lifeExp': 'life_exp',
                   'gdpPercap': 'gdp_per_cap'},
          inplace=True)
Create DateTimeIndex in Pandas
import datetime as dt
import pandas as pd
df = pd.DataFrame({'year': [2015, 2016],
                   'month': [12, 1],
                   'day': [31, 1],
                   'hour': [23, 1]})
# returns datetime objects
df['Timestamp'] = df.apply(lambda row: dt.datetime(row.year, row.month, row.day, row.hour),
                           axis=1)
# converts to pandas timestamps if desired
df['Timestamp'] = pd.to_datetime(df.Timestamp)
>>> df
day hour month year Timestamp
0 31 23 12 2015 2015-12-31 23:00:00
1 1 1 1 2016 2016-01-01 01:00:00
# Create a DatetimeIndex and assign it to the dataframe.
df.index = pd.DatetimeIndex(df.Timestamp)
>>> df
day hour month year Timestamp
2015-12-31 23:00:00 31 23 12 2015 2015-12-31 23:00:00
2016-01-01 01:00:00 1 1 1 2016 2016-01-01 01:00:00
How to extract specific content in a pandas dataframe with a regex?
#convert column to string
df['movie_title'] = df['movie_title'].astype(str)
# but this also removes numbers that are part of movie names
df['titles'] = df['movie_title'].str.extract('([a-zA-Z ]+)', expand=False).str.strip()
df['titles1'] = df['movie_title'].str.split('(', 1).str[0].str.strip()
df['titles2'] = df['movie_title'].str.replace(r'\([^)]*\)', '').str.strip()
print(df)
movie_title titles titles1 titles2
0 Toy Story 2 (1995) Toy Story Toy Story 2 Toy Story 2
1 GoldenEye (1995) GoldenEye GoldenEye GoldenEye
2 Four Rooms (1995) Four Rooms Four Rooms Four Rooms
3 Get Shorty (1995) Get Shorty Get Shorty Get Shorty
4 Copycat (1995) Copycat Copycat Copycat
import re
value = re.sub(r"[^0-9]+", "", value)
# re.sub works on a single string, not a Series; use the vectorized form for a column:
df['Pricing'].replace(to_replace='[^0-9]+', value='', inplace=True, regex=True)
import pandas as pd
df = pd.DataFrame(['$40,000*','$40000 conditions attached'], columns=['P'])
print(df)
# P
# 0 $40,000*
# 1 $40000 conditions attached
df['P'] = df['P'].str.replace(r'\D+', '').astype('int')
print(df)
#yields
P
0 40000
1 40000
Regular expression to extract numbers from a string
^ # start of string
\s* # optional whitespace
(\w+) # one or more alphanumeric characters, capture the match
\s* # optional whitespace
\( # a (
\s* # optional whitespace
(\d+) # a number, capture the match
\D+ # one or more non-digits
(\d+) # a number, capture the match
\D+ # one or more non-digits
\) # a )
\s* # optional whitespace
$ # end of string
[^0-9]+([0-9]+)[^0-9]+([0-9]+).+
Delete column from pandas DataFrame
del df['column_name']
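drop() can remove several columns at once and, without inplace=True, returns a new frame; a small sketch with made-up column names:
import pandas as pd
df = pd.DataFrame({'a': [1, 2], 'b': [3, 4], 'c': [5, 6]})
df2 = df.drop(['b', 'c'], axis=1)  # new frame without 'b' and 'c'
print(df2.columns.tolist())  # ['a']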
Data Interpolation
import numpy as np
import matplotlib.pyplot as plt
from scipy import interpolate
from scipy.optimize import fsolve
import math
x = np.array([10,20,30,40,50])
y = np.array([0.2,0.6,-0.2,-0.5,0.7])
tck = interpolate.splrep(x, y, s=0)
xnew = np.arange(10,50,1)
ynew = interpolate.splev(xnew, tck, der=0)
# ynewder1 = interpolate.splev(xnew, tck, der=1)
# ynewder2 = interpolate.splev(xnew, tck, der=2)
plt.scatter(xnew,ynew)
How can I replace all the NaN values with zeros in a column of a pandas DataFrame?
df[1].fillna(0, inplace=True)
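Calling fillna on the whole frame replaces every NaN at once; a minimal sketch:
import numpy as np
import pandas as pd
df = pd.DataFrame({'a': [1.0, np.nan], 'b': [np.nan, 2.0]})
df = df.fillna(0)  # every NaN in every column becomes 0
print(df)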
How to add an empty column to a dataframe?
df = pd.DataFrame({"A": [1,2,3], "B": [2,3,4]})
df
Out[18]:
A B
0 1 2
1 2 3
2 3 4
df.assign(C="",D=np.nan)
Out[21]:
A B C D
0 1 2 NaN
1 2 3 NaN
2 3 4 NaN
Pandas add one day to column
montdist['date'] + pd.DateOffset(1)
pd.DatetimeIndex(df.date) + pd.offsets.Hour(1)
montdist['shifted_date'] = montdist.date + datetime.timedelta(days=1)
df['newdate'] = pd.to_datetime(df['date']) + pd.DateOffset(1)
df['newdate'] = pd.Series(index=df.index).tshift(periods=1, freq='D').index
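Of the variants above, plain addition of an offset is the most direct; a runnable sketch with a throwaway frame:
import pandas as pd
df = pd.DataFrame({'date': pd.to_datetime(['2015-12-31', '2016-01-01'])})
df['newdate'] = df['date'] + pd.DateOffset(days=1)
print(df)
#         date    newdate
# 0 2015-12-31 2016-01-01
# 1 2016-01-01 2016-01-02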
Convert a pandas index to a DatetimeIndex
import re
import datetime as dt
import pandas as pd
indexstrs = df_train.index
#indexstrs[0] = '2001-01-01'
years = [ int(re.sub(r'\D+','',rowstr.split('-')[0])) for rowstr in indexstrs]
months = [ int(re.sub(r'\D+','',rowstr.split('-')[1])) for rowstr in indexstrs]
days = [ int(re.sub(r'\D+','',rowstr.split('-')[2])) for rowstr in indexstrs]
indexnewstrs = [dt.datetime(years[i],months[i],days[i]) for i in range(len(years))]
#indexnewstrs
df_train['timeindex'] = indexnewstrs
df_train.index = pd.DatetimeIndex(indexnewstrs)
del df_train['timeindex']
df_train.head()
prevtpdate = df_train.index[0]
nexttpdate = df_train.index[0]
reward = 0
for curdate in df_train.index:
    if curdate >= nexttpdate and prevtpdate < nexttpdate:
        prevtpdate = nexttpdate
    for _idx in df_turnpoints.index:
        if curdate < _idx:
            nexttpdate = _idx
            # print("prevtpdate {} curdate {} nexttpdate {}".format(prevtpdate, curdate, nexttpdate))
            break
    curdatetostr = curdate.strftime('%Y-%m-%d')
    nexttptostr = nexttpdate.strftime('%Y-%m-%d')
    nextval = df_train[nexttptostr:nexttptostr]['Close'].values[0]
    curval = df_train[curdatetostr:curdatetostr]['Close'].values[0]
    reward = nextval - curval
    print("reward {}".format(reward))
Pandas: Convert Timestamp to datetime.date
In [11]: t = pd.Timestamp('2013-12-25 00:00:00')
In [12]: t.date()
Out[12]: datetime.date(2013, 12, 25)
In [13]: t.date() == datetime.date(2013, 12, 25)
Out[13]: True
datetime to string with series in python pandas
dates = dataframe.index               # dates is the DatetimeIndex
datestrs = dates.strftime('%Y-%m-%d') # returns a numpy array of strings
dstr = datestrs.tolist()
Python NumPy: Get the values and indices of the elements that are bigger than 10 in a given array
import numpy as np
x = np.array([[0, 10, 20], [20, 30, 40]])
print("Original array: ")
print(x)
print("Values bigger than 10 =", x[x>10])
print("Their indices are ", np.nonzero(x > 10))
Original array:
[[ 0 10 20]
[20 30 40]]
Values bigger than 10 = [20 20 30 40]
Their indices are (array([0, 1, 1, 1]), array([2, 0, 1, 2]))
Insert values dated by one DatetimeIndex into another
for i in range(data_preidxintrp.shape[0]):
    basestr = data_preidxintrp.index[i]
    basevalue = data_preidxintrp['value'][i]
    if basestr not in dfkospinew.index:
        # walk forward one day at a time until a date that exists in dfkospinew
        while True:
            if basestr in dfkospinew.index:
                basestr_timestamptostr = basestr.strftime('%Y-%m-%d')
                dfkospinew[basestr_timestamptostr:basestr_timestamptostr] = basevalue
                break
            basestr += pd.DateOffset(1)
numpy.zeros() in Python
# Python Program illustrating
# numpy.zeros method
import numpy as geek
b = geek.zeros(2, dtype = int)
print("Matrix b : \n", b)
a = geek.zeros([2, 2], dtype = int)
print("\nMatrix a : \n", a)
c = geek.zeros([3, 3])
print("\nMatrix c : \n", c)
Find the B-spline representation of 1-D curve (Interpolation)
import numpy as np
import matplotlib.pyplot as plt
from scipy.interpolate import splev, splrep
x = np.linspace(0, 10, 10)
y = np.sin(x)
spl = splrep(x, y)
x2 = np.linspace(0, 10, 200)
y2 = splev(x2, spl)
plt.plot(x, y, 'o', x2, y2)
plt.show()
Deleting multiple columns based on column names in Pandas
yourdf.drop(['columnheading1', 'columnheading2'], axis=1, inplace=True)
for col in df.columns:
    if 'Unnamed' in col:
        del df[col]
df.drop([col for col in df.columns if "Unnamed" in col], axis=1, inplace=True)
df.drop(df.columns[22:56], axis=1, inplace=True)
How to add column to numpy array
my_data = np.random.random((210,8)) #recfromcsv('LIAB.ST.csv', delimiter='\t')
new_col = my_data.sum(1)[...,None] # None keeps (n, 1) shape
new_col.shape
#(210,1)
all_data = np.append(my_data, new_col, 1)
all_data.shape
#(210,9)
all_data = np.hstack((my_data, new_col))
#or
all_data = np.concatenate((my_data, new_col), 1)
Numpy expand dims
>>> x = np.array([1, 2])
>>> y = np.expand_dims(x, axis=0)
>>> y
array([[1, 2]])
>>> y.shape
(1, 2)
>>>
>>> y = np.expand_dims(x, axis=1) # Equivalent to x[:,np.newaxis]
>>> y
array([[1],
[2]])
>>> y.shape
(2, 1)
Keras: How to save model and continue training?
model.save('partly_trained.h5')
del model
from keras.models import load_model
model = load_model('partly_trained.h5')
filepath="LPT-{epoch:02d}-{loss:.4f}.h5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]
# fit the model
model.fit(x, y, epochs=60, batch_size=50, callbacks=callbacks_list)
try:
    model.load_weights(path_checkpoint)
except Exception as error:
    print("Error trying to load checkpoint.")
    print(error)
self.saver = tf.train.Saver()
model_name = self.model_name + ".model"
self.saver.save(
    self.sess,
    os.path.join(self.model_logs_dir, model_name),
    global_step=step
)
ckpt = tf.train.get_checkpoint_state(self.model_logs_dir)
if ckpt and ckpt.model_checkpoint_path:
    ckpt_name = os.path.basename(ckpt.model_checkpoint_path)
    self.saver.restore(self.sess, os.path.join(self.model_logs_dir, ckpt_name))
How to prevent tensorflow from allocating the totality of a GPU memory?
# Assume that you have 12GB of GPU memory and want to allocate ~4GB:
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.333)
sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
Build Multiple Layer LSTM network
"[p0,p1,p3][p4,p5,p6] => 2 num steps, 3 input size"
"[p7,p8,p9] => 3 output size"
self.inputs = tf.placeholder(tf.float32, [None, self.num_steps, self.input_size], name="inputs")
self.targets = tf.placeholder(tf.float32, [None, self.input_size], name="targets")
def _create_one_cell():
    lstm_cell = tf.contrib.rnn.LSTMCell(self.lstm_size, state_is_tuple=True)
    lstm_cell = tf.contrib.rnn.DropoutWrapper(lstm_cell, output_keep_prob=self.keep_prob)
    return lstm_cell

cell = tf.contrib.rnn.MultiRNNCell(
    [_create_one_cell() for _ in range(self.num_layers)],
    state_is_tuple=True
)
val, state_ = tf.nn.dynamic_rnn(cell, self.inputs_with_embed, dtype=tf.float32, scope="dynamic_rnn")
ws = tf.Variable(tf.truncated_normal([self.lstm_size, self.input_size]), name="w")
bias = tf.Variable(tf.constant(0.1, shape=[self.input_size]), name="b")
self.pred = tf.matmul(last, ws) + bias
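`last` is not defined in the snippet above; in the stock-RNN tutorial this pattern comes from, it is the dynamic_rnn output at the final time step. A hedged reconstruction:
# val has shape [batch_size, num_steps, lstm_size]
val = tf.transpose(val, [1, 0, 2])  # -> [num_steps, batch_size, lstm_size]
# take the output of the last time step as input to the readout layer
last = tf.gather(val, int(val.get_shape()[0]) - 1, name="lstm_state")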
Normalization seq data
seq = [np.array([520.95]), np.array([521.43]), np.array([558.02]), np.array([580.85]), np.array([586.65]), np.array([589.92])]
print('seq[0] {} seq[0][0] {}'.format(seq[0],seq[0][0]))
seq = [seq[0] / seq[0][0] - 1.0] + [
    curr / seq[i][-1] - 1.0 for i, curr in enumerate(seq[1:])]
seq
>> seq[0] [520.95] seq[0][0] 520.95
Out[19]:
[array([0.]),
array([0.00092139]),
array([0.07017241]),
array([0.04091251]),
array([0.00998537]),
array([0.00557402])]
numpy array to list
>>> a = np.array([1, 2])
>>> a.tolist()
[1, 2]
>>> a = np.array([[1, 2], [3, 4]])
>>> list(a)
[array([1, 2]), array([3, 4])]
>>> a.tolist()
[[1, 2], [3, 4]]
Show DataFrame as table in iPython Notebook
from IPython.display import display, HTML
# Assuming that dataframes df1 and df2 are already defined:
print("Dataframe 1:")
display(df1)
print("Dataframe 2:")
display(HTML(df2.to_html()))
Tensor manipulation
t = np.array([[1., 2., 3.], [4., 5., 6.], [7., 8., 9.], [10., 11., 12.]])
pp.pprint(t)
print(t.ndim) # rank
print(t.shape) # shape
t = tf.constant([1,2,3,4])
tf.shape(t).eval()
>>array([4], dtype=int32)
matrix1 = tf.constant([[3., 3.]])
matrix2 = tf.constant([[2., 2.]])
(matrix1+matrix2).eval()
>>array([[ 5., 5.]], dtype=float32)
tf.random_normal([3]).eval()
>>array([ 2.20866942, -0.73225045, 0.33533147], dtype=float32)
x = [[1., 2.],
     [3., 4.]]
tf.reduce_mean(x).eval()
>>2.5
tf.reduce_mean(x, axis=0).eval()
>>array([ 2., 3.], dtype=float32)
tf.reduce_mean(x, axis=1).eval()
>>array([ 1.5, 3.5], dtype=float32)
x = [[0, 1, 2],
     [2, 1, 0]]
tf.argmax(x, axis=0).eval()
>>array([1, 0, 0])
tf.argmax(x, axis=1).eval()
>>array([2, 0])
t = np.array([[[0, 1, 2],
               [3, 4, 5]],
              [[6, 7, 8],
               [9, 10, 11]]])
t.shape
>>(2, 2, 3)
tf.squeeze([[0], [1], [2]]).eval()
>>array([0, 1, 2], dtype=int32)
tf.expand_dims([0, 1, 2], 1).eval()
>>array([[0],
[1],
[2]], dtype=int32)
tf.one_hot([[0], [1], [2], [0]], depth=3).eval()
>>array([[[ 1., 0., 0.]],
[[ 0., 1., 0.]],
[[ 0., 0., 1.]],
[[ 1., 0., 0.]]], dtype=float32)
tf.cast([1.8, 2.2, 3.3, 4.9], tf.int32).eval()
>>array([1, 2, 3, 4], dtype=int32)
x = [1, 4]
y = [2, 5]
z = [3, 6]
# Pack along first dim.
tf.stack([x, y, z]).eval()
>>array([[1, 4],
[2, 5],
[3, 6]], dtype=int32)
tf.stack([x, y, z], axis=1).eval()
>>array([[1, 2, 3],
[4, 5, 6]], dtype=int32)
Numpy squeeze
>>> x = np.array([[[0], [1], [2]]])
>>> x.shape
(1, 3, 1)
>>> np.squeeze(x).shape
(3,)
>>> np.squeeze(x, axis=0).shape
(3, 1)
>>> np.squeeze(x, axis=1).shape
Traceback (most recent call last):
...
ValueError: cannot select an axis to squeeze out which has size not equal to one
>>> np.squeeze(x, axis=2).shape
(1, 3)
import tensorflow as tf
a, b, c = 2, 3, 4
x = tf.Variable(tf.random_normal([a, b, c], mean=0.0, stddev=1.0, dtype=tf.float32))
s = tf.shape(x)
init = tf.initialize_all_variables()
sess = tf.Session()
sess.run(init)
v1, v2, v3 = sess.run(s)
y = tf.reshape(x, [v1 * v2, v3])
shape = tf.shape(y)
print (sess.run(y))
print (sess.run(shape))
a = tf.constant([[30, 29, 19, 17, 12, 11],
                 [30, 27, 20, 16, 5, 1],
                 [28, 25, 17, 14, 7, 2],
                 [28, 26, 21, 14, 6, 4]], dtype=tf.int32)
print(a.get_shape())
>>(4, 6)
a = tf.expand_dims(a, axis=2)
print(a.get_shape())
>>(4, 6, 1)
sess = tf.InteractiveSession()
init = tf.global_variables_initializer()
sess.run(init)
y_pred = sess.run([Y_pred], feed_dict={
    X: trainX, Y: trainY})
print(y_pred[0].shape,type(y_pred))
new_y_pred = y_pred[0]
new_y_pred = tf.expand_dims(new_y_pred,axis=0)
print(new_y_pred.get_shape())
axis1 = new_y_pred.get_shape()[1]
print(int(int(axis1)/5))
axis1 = int(int(axis1)/5)
#new_y_pred = tf.reshape(new_y_pred,[])
new_y_pred = tf.reshape(new_y_pred,[axis1,5,1])
print(new_y_pred.get_shape())
>>(3005, 1) "<class 'list'>"
>>(1, 3005, 1)
>>601
>>(601, 5, 1)
matplotlib legend
plt.figure(figsize=(12, 6))
plt.plot(days, truths, label='truth')
plt.plot(days, preds, label='pred')
plt.legend(loc='upper left', frameon=False)
plt.xlabel("day")
plt.ylabel("normalized price")
plt.ylim((min(truths), max(truths)))
plt.grid(ls='--')
plt.savefig(figname, format='png', bbox_inches='tight')#, transparent=True)
Adding new column to existing DataFrame in Python pandas
df1['e'] = pd.Series(np.random.randn(sLength), index=df1.index)
>>> df1.loc[:,'f'] = pd.Series(np.random.randn(sLength), index=df1.index)
>>> df1
a b c d e f
6 -0.269221 -0.026476 0.997517 1.294385 1.757167 -0.050927
8 0.917438 0.847941 0.034235 -0.448948 2.228131 0.006109
>>>
df1 = df1.assign(e=pd.Series(np.random.randn(sLength)).values)
numpy generating random sin curve
import numpy as np
import matplotlib.pylab as plt
>>> x = np.linspace(-np.pi, np.pi, 201)
>>> plt.plot(x, np.sin(x))
>>> plt.xlabel('Angle [rad]')
>>> plt.ylabel('sin(x)')
>>> plt.axis('tight')
>>> plt.show()
calculate turning points / pivot points in trajectory (path)
import numpy as np
import matplotlib.pyplot as plt
import scipy.ndimage
def first_derivative(x):
    return x[2:] - x[0:-2]

def second_derivative(x):
    return x[2:] - 2 * x[1:-1] + x[:-2]

def curvature(x, y):
    x_1 = first_derivative(x)
    x_2 = second_derivative(x)
    y_1 = first_derivative(y)
    y_2 = second_derivative(y)
    return np.abs(x_1 * y_2 - y_1 * x_2) / np.sqrt((x_1**2 + y_1**2)**3)

def plot_turning_points(x, y, turning_points=10, smoothing_radius=3,
                        cluster_radius=10):
    if smoothing_radius:
        weights = np.ones(2 * smoothing_radius + 1)
        new_x = scipy.ndimage.convolve1d(x, weights, mode='constant', cval=0.0)
        new_x = new_x[smoothing_radius:-smoothing_radius] / np.sum(weights)
        new_y = scipy.ndimage.convolve1d(y, weights, mode='constant', cval=0.0)
        new_y = new_y[smoothing_radius:-smoothing_radius] / np.sum(weights)
    else:
        new_x, new_y = x, y
    k = curvature(new_x, new_y)
    turn_point_idx = np.argsort(k)[::-1]
    t_points = []
    while len(t_points) < turning_points and len(turn_point_idx) > 0:
        t_points += [turn_point_idx[0]]
        idx = np.abs(turn_point_idx - turn_point_idx[0]) > cluster_radius
        turn_point_idx = turn_point_idx[idx]
    t_points = np.array(t_points)
    t_points += smoothing_radius + 1
    plt.plot(x, y, 'k-')
    plt.plot(new_x, new_y, 'r-')
    print('t_points {}'.format(t_points))
    plt.plot(x[t_points], y[t_points], 'o')
    plt.show()
#x, y = np.genfromtxt('bla.data')
y = np.array([0,2,3,4,5,2,1,2,3,4,5,6,7,8,7,6,5,4,5,6])
#x = np.arange(len(y))
x = np.linspace(-np.pi, np.pi, 50)
y = np.sin(x)*np.random.randint(10,size=1)
#print(x,y)
plot_turning_points(x, y, turning_points=20, smoothing_radius=1,cluster_radius=10)
The Ramer-Douglas-Peucker algorithm implemented in Python
Ramer–Douglas–Peucker algorithm
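The links above carried no code; a minimal recursive sketch of the algorithm (helper names are mine):
import numpy as np

def point_line_distance(pt, start, end):
    # perpendicular distance from pt to the line through start and end
    if np.allclose(start, end):
        return np.linalg.norm(pt - start)
    return np.abs(np.cross(end - start, start - pt)) / np.linalg.norm(end - start)

def rdp(points, epsilon):
    # keep the endpoints; recursively keep the farthest point whenever it
    # deviates from the chord by more than epsilon
    dmax, index = 0.0, 0
    for i in range(1, len(points) - 1):
        d = point_line_distance(points[i], points[0], points[-1])
        if d > dmax:
            dmax, index = d, i
    if dmax > epsilon:
        left = rdp(points[:index + 1], epsilon)
        right = rdp(points[index:], epsilon)
        return np.vstack((left[:-1], right))
    return np.vstack((points[0], points[-1]))

pts = np.array([[0, 0], [1, 0.1], [2, -0.1], [3, 5], [4, 6.1], [5, 7], [6, 8.1]], dtype=float)
print(rdp(pts, 1.0))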
numpy.asmatrix
>>> x = np.array([[1, 2], [3, 4]])
>>> m = np.asmatrix(x)
>>> x[0,0] = 5
>>> m
matrix([[5, 2],
[3, 4]])
numpy.hstack
>>> a = np.array((1,2,3))
>>> b = np.array((2,3,4))
>>> np.hstack((a,b))
array([1, 2, 3, 2, 3, 4])
>>> a = np.array([[1],[2],[3]])
>>> b = np.array([[2],[3],[4]])
>>> np.hstack((a,b))
array([[1, 2],
[2, 3],
[3, 4]])
tensorflow saver and restore
import os
saver = tf.train.Saver()
save_dir = 'checkpoints/'
if not os.path.exists(save_dir):
    os.makedirs(save_dir)
save_path = os.path.join(save_dir, 'best_validation')
saver.save(sess=session, save_path=save_path)
saver.restore(sess=session, save_path=save_path)
tensorflow global step variable
tf.summary.scalar('loss',loss)
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
# 0 is the initial value of the step counter; trainable=False keeps it out of gradient updates
global_step = tf.Variable(0, name='global_step', trainable=False)
train_op = optimizer.minimize(loss, global_step=global_step)
TensorFlow-Tutorials/10_save_restore_net.py
global_step = tf.Variable(0, name='global_step', trainable=False)
ckpt = tf.train.get_checkpoint_state(ckpt_dir)
if ckpt and ckpt.model_checkpoint_path:
    print(ckpt.model_checkpoint_path)
    saver.restore(sess, ckpt.model_checkpoint_path)  # restore all variables
    start = global_step.eval()  # get last global_step
    print("Start from:", start)
# inside the training loop:
global_step.assign(i).eval()  # set and update(eval) global_step with index, i
saver.save(sess, ckpt_dir + "/model.ckpt", global_step=global_step)
Search A pandas Column For A Value
df['preTestScore'].where(df['postTestScore'] > 50)
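where() keeps the original length, filling non-matches with NaN; chain dropna() to keep only the hits. A small sketch with made-up scores:
import pandas as pd
df = pd.DataFrame({'preTestScore': [4, 24, 31], 'postTestScore': [25, 94, 57]})
print(df['preTestScore'].where(df['postTestScore'] > 50))          # NaN where the test fails
print(df['preTestScore'].where(df['postTestScore'] > 50).dropna()) # matches only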
Selection and Indexing Methods for Pandas DataFrames
12 Useful Pandas Techniques in Python for Data Manipulation
Pandas Cheat Sheet for Data Science in Python
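As a quick refresher on the selection methods those references cover, a minimal loc/iloc sketch:
import pandas as pd
df = pd.DataFrame({'Name': ['a', 'b', 'c'], 'Score': [1, 2, 3]})
print(df.loc[0, 'Score'])                # label-based: row label 0, column 'Score'
print(df.iloc[0, 1])                     # position-based: first row, second column
print(df.loc[df['Score'] > 1, 'Name'])   # boolean row selection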
making matplotlib scatter plots from dataframes in Python’s pandas
import matplotlib.pylab as plt
# df is a DataFrame: fetch col1 and col2
# and drop na rows if any of the columns are NA
mydata = df[["col1", "col2"]].dropna(how="any")
# Now plot with matplotlib
vals = mydata.values
plt.scatter(vals[:, 0], vals[:, 1])
mydata = df.dropna(how="any", subset=["col1", "col2"])
# plot a scatter of col1 by col2, with sizes according to col3
plt.scatter(mydata["col1"], mydata["col2"], s=mydata["col3"])
Pandas Dataframe: Plot Examples with Matplotlib and Pyplot
scatter plot
buysignals = df_result['actions'].where(df_result['actions'] == 1)
sellsignals = df_result['actions'].where(df_result['actions'] == 0)
holdsignals = df_result['actions'].where(df_result['actions'] == 2)
df_result['buy'] = buysignals
df_result['sell'] = sellsignals
df_result['hold'] = holdsignals
df_result['buy'] = df_result['Close'].where(df_result['buy'] ==1)
df_result['sell'] = df_result['Close'].where(df_result['sell'] == 0)
df_result['hold'] = df_result['Close'].where(df_result['hold'] == 2)
df_result['datetime'] = df_result.index
import matplotlib, datetime
import matplotlib.pyplot as plt
def scatter_date(df, x, y, datetimeformat):
    if not isinstance(y, list):
        y = [y]
    for yi in y:
        plt.plot_date(df[x].apply(
            lambda z: matplotlib.dates.date2num(
                datetime.datetime.strptime(z, datetimeformat))), df[yi], label=yi)
    plt.legend()
    plt.xlabel(x)
# Example Usage
plt.figure(figsize=(10,10))
scatter_date(df_result, x='datetime', y=['Close','buy','sell','hold'], datetimeformat='%Y-%m-%d')
pandas access value by index number
In [28]: x = pd.DataFrame({'x': [1, 2, 3], 'y': [3, 4, 5]})
In [29]: x.iloc[1] = dict(x=9, y=99)
In [30]: x
Out[30]:
x y
0 1 3
1 9 99
2 3 5
pandas zero column creation and insert value
df_total['turnpoint'] = np.zeros(len(df_total))
# chained indexing like this can raise SettingWithCopyWarning;
# df_total.iloc[newx[1], df_total.columns.get_loc('turnpoint')] = newy[1] avoids it
df_total['turnpoint'].iloc[newx[1]] = newy[1]
df_total['turnpoint'].iloc[newx[1]]
Set value for particular cell in pandas DataFrame using index
df.loc[df[<some_column_name>] == <condition>, <another_column_name>] = <value_to_add>
df.loc[row_index,col_indexer] = value
df.iloc[[2], [0]] = 10
x = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
x.iloc[1] = dict(A=10, B=-10)
Pandas sort
df = df.sort_values(['c1','c2'], ascending=[False,True])
pandas convert string to integer
s = lambda f: f.replace(',','')
dftotalname['A'] = dftotalname['A'].apply(s).astype(int)
pandas reset index
df = df.reset_index(drop=True)
How can I fill out a Python string with spaces?
'%06d' % 123   # pads with zeros; ljust/format below pad with spaces
>> '000123'
'hi'.ljust(10)
>>'hi '
'{0: <6}'.format('Hi')
>>'Hi '
Read all files in the Directory
import glob
path = '/home/mypc/download/*.html'
files=glob.glob(path)
for file in files:
    with open(file, 'r') as f:
        print(f.readlines())
Moving Average - Pandas
MovingAverage = pd.rolling_mean(Exchange, 5)         # old (pre-0.18) API
df['MA'] = df['yourcolumn'].rolling(window=5).mean() # current API, per column
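A runnable sketch of the current rolling API (the column name is made up):
import pandas as pd
df = pd.DataFrame({'Close': [1, 2, 3, 4, 5, 6]})
df['MA'] = df['Close'].rolling(window=5).mean()
print(df)  # first 4 rows of MA are NaN: the window is not yet full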
pandas df.pct_change()
df.pct_change()
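A quick demonstration of what pct_change() returns:
import pandas as pd
s = pd.Series([100, 110, 99])
print(s.pct_change())
# 0    NaN
# 1    0.1
# 2   -0.1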
In PANDAS, how to get the index of a known value?
In [48]: a
Out[48]:
c1 c2
0 0 1
1 2 3
2 4 5
3 6 7
4 8 9
In [49]: a.c1[a.c1 == 8].index.tolist()
Out[49]: [4]
In [25]: a.loc[a['c1'] == 8].index[0]
Out[25]: 4
In [17]: a.set_index('c1').index.get_loc(8)
Out[17]: 4
import numpy as np
import pandas as pd
In [800]: df = pd.DataFrame(np.arange(10).reshape(5,2),columns=['c1','c2'])
In [801]: df
Out[801]:
c1 c2
0 0 1
1 2 3
2 4 5
3 6 7
4 8 9
In [802]: np.where(df["c1"]==6)
Out[802]: (array([3]),)
In [803]: indices = list(np.where(df["c1"]==6)[0])
In [804]: df.iloc[indices]
Out[804]:
c1 c2
3 6 7
In [805]: df.iloc[indices].index
Out[805]: Int64Index([3], dtype='int64')
In [806]: df.iloc[indices].index.tolist()
Out[806]: [3]
matplotlib datetime plot
import matplotlib, datetime
import matplotlib.pyplot as plt
datetimeformat='%Y-%m-%d'
x = [matplotlib.dates.date2num(datetime.datetime.strptime(idx.strftime('%Y-%m-%d'), datetimeformat)) for idx in _y.index]
y = dfstock['Close'][_y.index]
plt.plot_date(x,y)
Python : How to Merge / Join two or more lists
def main():
    list1 = ["This", "is", "a", "sample", "program"]
    list2 = [10, 2, 45, 3, 5, 7, 8, 10]
    print("list1 : ", list1)
    print("list2 : ", list2)
    finalList = list1 + list2
    print("Merged List : ", finalList)
    list1.extend(list2)
    print("extended list1 : ", list1)
    list1 = ["This", "is", "a", "sample", "program"]
    list2 = [10, 2, 45, 3, 5, 7, 8, 10]
    list3 = [11, 12, 13]
    finalList = list1 + list2 + list3
    print("Merged List : ", finalList)

if __name__ == '__main__':
    main()
Pandas data copy
import copy
df_result = copy.deepcopy(df_test[SEQ_SIZE:])
# the idiomatic pandas equivalent: df_result = df_test[SEQ_SIZE:].copy()
Pretty-print (pprint) example
import pprint
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(cm)
How to remove specific elements in a numpy array
import numpy as np
a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9])
index = [2, 3, 6]
new_a = np.delete(a, index)
print(new_a) #Prints `[1, 2, 5, 6, 8, 9]`
How To Concatenate Arrays in NumPy?
array2D_1 = np.arange(9).reshape((3,3))
array2D_1
array([[0, 1, 2],
[3, 4, 5],
[6, 7, 8]])
array2D_2 = np.arange(10,19).reshape(3,3)
array2D_2
array([[10, 11, 12],
[13, 14, 15],
[16, 17, 18]])
np.concatenate((array2D_1,array2D_2))
array([[ 0, 1, 2],
[ 3, 4, 5],
[ 6, 7, 8],
[10, 11, 12],
[13, 14, 15],
[16, 17, 18]])
np.vstack((array2D_1, array2D_2))
array([[ 0, 1, 2],
[ 3, 4, 5],
[ 6, 7, 8],
[10, 11, 12],
[13, 14, 15],
[16, 17, 18]])
np.hstack((array2D_1, array2D_2))
array([[ 0, 1, 2, 10, 11, 12],
[ 3, 4, 5, 13, 14, 15],
[ 6, 7, 8, 16, 17, 18]])
np.append([[1, 2, 3], [4, 5, 6]], [[7, 8, 9]], axis=0)
array([[1, 2, 3],
[4, 5, 6],
[7, 8, 9]])
Pandas merged
ser1 = pd.Series(['A', 'B', 'C'], index=[1, 2, 3])
ser2 = pd.Series(['D', 'E', 'F'], index=[4, 5, 6])
pd.concat([ser1, ser2])
1 A
2 B
3 C
4 D
5 E
6 F
# make_df and display are helper functions from the Python Data Science Handbook examples
df1 = make_df('AB', [1, 2])
df2 = make_df('AB', [3, 4])
display('df1', 'df2', 'pd.concat([df1, df2])')
Pandas Merge, join, and concatenate
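The concat examples above stack frames end-to-end; merge joins on key columns, SQL-style. A minimal sketch:
import pandas as pd
left = pd.DataFrame({'key': ['K0', 'K1'], 'A': [1, 2]})
right = pd.DataFrame({'key': ['K0', 'K1'], 'B': [3, 4]})
print(pd.merge(left, right, on='key'))
#   key  A  B
# 0  K0  1  3
# 1  K1  2  4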
Keras load model
try:
    # load json and create model
    json_file = open(save_dir + 'model.json', 'r')
    loaded_model_json = json_file.read()
    json_file.close()
    model = model_from_json(loaded_model_json)
    # load weights into new model
    model.load_weights(filename)
    print("Loaded model from disk")
except:
    PrintException()
model.compile(loss='mse', optimizer='adam', metrics=['mse'])
tensorflow log level turnoff
tf.logging.set_verbosity(tf.logging.ERROR)
ipython notebook module reload
import importlib
importlib.reload(pp)
How to deal with SettingWithCopyWarning in Pandas?
df[df['A'] > 2]['B'] = new_val # new_val not set in df
df.loc[df['A'] > 2, 'B'] = new_val
df = df[df['A'] > 2]
df['B'] = new_val
pd.options.mode.chained_assignment = None # default='warn'
df.loc[:, 'Score']  # .ix is deprecated; use .loc for label-based selection
# Select 2nd row and 3rd column value
df.iloc[1,2]
# select 1st and 4thcolumn
df.iloc[:,[0,3]]
# select first 2 columns
df.iloc[:,:2]
df.loc[[1,2,3,4,5],['Name','Score']]
python reversed enumerate
for _fix, _val in reversed(list(enumerate(_holdlist))):
    print(_fix, _val)
tensorflow memory config
config = tf.ConfigProto(allow_soft_placement=True)
config.gpu_options.allocator_type = 'BFC'
config.gpu_options.per_process_gpu_memory_fraction = 0.40
config.gpu_options.allow_growth = True
Multiple Models load
import glob
path = ''
dirnames=glob.glob(path)
model_len = len(dirnames)
models = []
for dirname in dirnames:
    graph = tf.Graph()
    sess = tf.Session(graph=graph)
    with graph.as_default():
        model = modelclass(sess, ...)
        model.load()
    models.append(model)
File delete or remove in directory
files = glob.glob('./23/*')
for f in files:
    os.remove(f)
os.rmdir(logs_dir)
Tensorflow save checkpoint option
# If we don't want to write the meta-graph, pass write_meta_graph=False:
saver.save(sess, 'my-model', global_step=step, write_meta_graph=False)
# To keep only the 4 latest checkpoints, plus one checkpoint for every 2 hours of training, use max_to_keep and keep_checkpoint_every_n_hours:
saver = tf.train.Saver(max_to_keep=4, keep_checkpoint_every_n_hours=2)
Shuffle Random
from sklearn.utils import shuffle
sentiment_data = pd.DataFrame()
sentiment_data = shuffle(sentiment_data)
Random Normal Generation
import scipy.stats as ss
import numpy as np
import matplotlib.pyplot as plt
x = np.arange(-10, 11)
xU, xL = x + 0.5, x - 0.5
prob = ss.norm.cdf(xU, scale = 3) - ss.norm.cdf(xL, scale = 3)
prob = prob / prob.sum() #normalize the probabilities so their sum is 1
nums = np.random.choice(x, size = 10000, p = prob)
plt.hist(nums, bins = len(x))
Cache Tutorial
from cache import cache
# If the cache-file already exists then reload it,
# otherwise process all images and save their transfer-values
# to the cache-file so it can be reloaded quickly.
transfer_values = cache(cache_path=cache_path,
                        fn=process_images,
                        data_dir=coco.train_dir,
                        filenames=filenames_train)
searching datetime index
# 1. add an integer position column, then slice by position to recover the DatetimeIndex
dffault['index'] = np.arange(dffault.shape[0])
tempdf = dffault['index'].iloc[int(startday):int(endday)].index
# 2. select rows by that DatetimeIndex
dffault.ix[tempdf]  # .loc in newer pandas
Using Dropout with Keras and LSTM/GRU cell
#In Keras you can specify a dropout layer like this:
model.add(Dropout(0.5))
#But with a GRU cell you can specify the dropout as a parameter in the constructor:
model.add(GRU(units=512,
              return_sequences=True,
              dropout=0.5,
              input_shape=(None, features_size,)))
pandas floor
floored_data = data.apply(np.floor)
signal display
%matplotlib notebook
dftmp = augRLmergeddt[0][2]
dftmp[['Close']].plot()
buyindex = dftmp['signal_5ma'][dftmp['signal_5ma']==9].index
sellindex = dftmp['signal_5ma'][dftmp['signal_5ma']==-9].index
plt.scatter(buyindex,dftmp['Close'][buyindex],c='red')
plt.scatter(sellindex,dftmp['Close'][sellindex],c='green')