Python：设置与切换工作目录、读取数据以及查看数据

公彦栋 2017-09-30

展开全文

import os

# Show the current working directory.
os.getcwd()

# Switch to the project directory. The raw string keeps the backslash of the
# Windows path literal instead of relying on "\p" being an unrecognised escape.
# Single and double quotes are interchangeable for Python string literals.
# Original author's advice: keep the path to English (ASCII) characters only.
os.chdir(r"F:\python_test")

# Python plotting library
import matplotlib.pyplot as plt
# Numerical python library (pronounced "num-pie")
import numpy as np
# Dataframes in Python
import pandas as pd
# Statistical plotting library we'll use
import seaborn as sns
# This is necessary to show the plotted figures inside the notebook -- "inline" with the notebook cells
 %matplotlib inline

##### Reading the expression file

# Load the gzip-compressed, tab-delimited table; the first column becomes the row index.
shalek2013_expression = pd.read_table('GSE41265_allGenesTPM.txt.gz',                           
                                      index_col=0, 
                                      compression='gzip')
shalek2013_expression.head()### preview the first rows
##### Configure how many columns and rows pandas prints
pd.options.display.max_columns = 50
pd.options.display.max_rows = 50
shalek2013_expression.head()
shalek2013_expression### display the frame (original note: check the dimensions of the data)
##### Read the annotation (metadata) file
# The first 33 lines of the file are skipped (presumably series-level header
# lines -- TODO confirm) and the first column is used as the row index.
# NOTE(review): this path is under ~/Downloads while the expression file is
# read from the working directory -- confirm where the files actually live.
shalek2013_metadata = pd.read_table('~/Downloads/GSE41265_series_matrix.txt', 
                                    skiprows=33, 
                                    index_col=0)
shalek2013_metadata
#### Transpose: swap rows and columns

shalek2013_metadata = shalek2013_metadata.T
shalek2013_metadata
# shalek2013_metadata.index 与 shalek2013_metadata.columns 分别是行名与列名，对应 R 中的 rownames 和 colnames
#### Tidy up the column names
# Builds a list of the column labels with leading/trailing '!' removed.
# The result is only displayed; nothing is assigned here.
[x.strip('!') for x in shalek2013_metadata.columns]
# 上面的代码也可以用函数来实现
def remove_exclamation(x):
    """Return ``x`` with every leading and trailing '!' character removed."""
    stripped = x.strip('!')
    return stripped
# Same cleanup through the column index's map() and the named function;
# the result is displayed but not stored.
shalek2013_metadata.columns.map(remove_exclamation)
#### Assign the cleaned labels back to the frame
shalek2013_metadata.columns = shalek2013_metadata.columns.map(lambda x: x.strip('!'))
shalek2013_metadata.head(8)#### show the first 8 rows
#### Draw a box plot and save the figure
# Pass the frame as `data=` so it is treated as the dataset on every seaborn
# version, rather than depending on what the first positional parameter is.
sns.boxplot(data=shalek2013_expression)
# gcf = Get current figure
fig = plt.gcf()
fig.savefig('shalek2013_expression_boxplot.pdf')
##### Boolean masking
# NOTE(review): `expression_logged` is not defined anywhere in the visible
# code -- presumably a log-transformed copy of shalek2013_expression; confirm
# and define it before this point.
# Element-wise comparison against 10 (displayed only).
expression_logged < 10
# Index with that boolean mask (assuming a DataFrame, entries where the mask
# is False become NaN -- TODO confirm the type).
expression_at_most_10 = expression_logged[expression_logged < 10]
expression_at_most_10
#### Quality control (QC). pandas reductions run down each column by default;
#### pass axis=1 to reduce across the columns of each row instead.
# Boolean row mask: True where the value exceeds 1 in at least 3 columns.
genes_of_interest = (expression_logged > 1).sum(axis=1) >= 3
# Row selection with .loc and the boolean mask.
expression_filtered_by_all_samples = expression_logged.loc[genes_of_interest]
print(expression_filtered_by_all_samples.shape)
expression_filtered_by_all_samples.head()
# Pass the frame as `data=` so it is treated as the dataset on every seaborn version.
sns.boxplot(data=expression_filtered_by_all_samples)
# gcf = Get current figure
fig = plt.gcf()
fig.savefig('expression_filtered_by_all_samples_boxplot.pdf')
##### QC on the columns (cells)
# Column labels that begin with 'P'.
pooled_ids = [x for x in expression_logged.columns if x.startswith('P')]
# Concise Python: indexing a frame with a list of labels selects columns by
# default (row selection needs .loc), so this is the same selection as
# expression_logged.loc[:, pooled_ids].
pooled = expression_logged[pooled_ids]
####### The QC above used every column; the following code refers to the single-cell columns only
# Column labels that begin with 'S'.
single_cell = [x for x in expression_logged.columns if x.startswith('S')]
expression_by_single_cells = expression_logged[single_cell]
# Boolean row mask: True where the value exceeds 1 in more than 3 of the selected columns.
# NOTE(review): this threshold is strict (> 3) whereas the genes_of_interest
# filter uses >= 3 -- confirm which one is intended.
gene_select = (expression_by_single_cells > 1).sum(axis=1) > 3
expression_filtered_by_singles = expression_by_single_cells.loc[gene_select]
# Sanity check on the resulting (rows, columns) shape; the expected numbers
# are specific to this dataset.
assert expression_filtered_by_singles.shape == (6312, 21)