import json
import pandas as pd
import sys
import numpy as np
import re
class handle_data():
    """Clean delimiter-separated text lines and emit tab-separated rows.

    Lines are split into cells, assembled into a pandas DataFrame,
    de-duplicated, optionally have their empty cells filled, and are then
    flattened back into a single newline-joined string.
    """

    # Characters removed from every cell when rows are turned back into text:
    # parentheses, both quote characters and spaces.
    _STRIP_TABLE = str.maketrans('', '', '()"\' ')

    def transfer_to_list(self, string):
        """Split one raw input line into a list of cell strings.

        :param string: the raw line; it is converted with ``str()`` first.
        :return: list of cells obtained by splitting on commas after spaces
            and tabs have been turned into commas, line terminators removed,
            runs of three backslashes and the ``'{`` / ``}'`` wrappers
            dropped, and doubled double quotes collapsed to a single one.
        """
        text = str(string)
        # Spaces and tabs are treated as additional field separators.
        text = text.replace(" ", ",").replace("\t", ",")
        # Drop line terminators; "\r" is removed as well so CRLF input does
        # not leave a stray carriage return in the last cell.
        text = text.replace("\n", "").replace("\r", "")
        text = (
            text.replace("\\\\\\", "")  # a run of three literal backslashes
            .replace("'{", "")
            .replace("}'", "")
            .replace('""', '"')
        )
        return text.split(',')

    def transfer_to_str(self, df):
        """Flatten a DataFrame into newline-separated text rows.

        Every cell is converted with ``str()`` and stripped of parentheses,
        quote characters and spaces. Cells are joined with commas; unless the
        row text contains a ``:`` after its first character, every comma is
        then replaced by a tab. Rows that do contain such a colon keep their
        commas.

        :param df: the DataFrame to serialise.
        :return: the rows joined by ``"\\n"`` with no trailing newline; an
            empty string when there are no rows.
        """
        lines = []
        # name=None yields plain tuples, so very wide frames are fine too.
        for row in df.itertuples(index=False, name=None):
            # Format cell by cell rather than via str(tuple(row)): the tuple
            # repr leaks a trailing comma for one-column rows, doubles
            # backslashes and can embed numpy scalar reprs in the output.
            cells = [str(cell).translate(self._STRIP_TABLE) for cell in row]
            text = ",".join(cells)
            # A colon at index 0 does not count, matching the historical
            # ``find(':') > 0`` check.
            if text.find(":") <= 0:
                text = text.replace(",", "\t")
            lines.append(text)
        return "\n".join(lines)

    def data_cleaning(self, df, argv1=1, value=999999):
        """Clean an iterable of raw lines and return the cleaned text.

        :param df: iterable of raw input lines (``main`` passes ``sys.stdin``).
        :param argv1: cleaning mode; only when it equals 2 are empty-string
            and missing cells replaced by ``value``. Any other mode leaves
            them untouched.
        :param value: replacement for empty/missing cells in mode 2
            (defaults to 999999).
        :return: the cleaned rows as produced by ``transfer_to_str``.
        """
        rows = [self.transfer_to_list(line) for line in df]
        # dtype=object keeps cells as plain Python objects so that missing
        # cells and numeric fill values can coexist with strings.
        frame = pd.DataFrame(rows, dtype=object).drop_duplicates()
        if argv1 == 2:
            frame = frame.replace('', value).fillna(value)
        return self.transfer_to_str(frame)

    def main(self):
        """Read lines from stdin, clean them and print the result.

        Command-line arguments: ``[mode [value]]``. The defaults of
        ``data_cleaning`` are used when no argument or more than two
        arguments are given, or when ``mode`` is greater than 2. Failures
        are reported on stderr instead of being silently discarded.
        """
        try:
            args = sys.argv[1:]
            if len(args) not in (1, 2) or int(args[0]) > 2:
                cleaned = self.data_cleaning(sys.stdin)
            elif len(args) == 1:
                cleaned = self.data_cleaning(sys.stdin, int(args[0]))
            else:
                cleaned = self.data_cleaning(
                    sys.stdin, int(args[0]), int(args[1])
                )
            print(cleaned)
        except Exception as exc:
            # Top-level boundary: keep the script from crashing, but make
            # the failure visible rather than producing empty output.
            print(
                "handle_data: cleaning failed: {!r}".format(exc),
                file=sys.stderr,
            )
# Script entry point: run the cleaner only when executed directly.
if __name__ == '__main__':
    handle_data().main()
运行脚本之后,数据可以清洗出来,但是类型不对,无法写入 Hive,报错:FAILED: SemanticException [Error 10044]: Line 1:18 Cannot insert into target table because column number/types are different 'waybill_base_info': Cannot convert column 224 from string to map<string,string>.
全部评论
(0) 回帖