сложный анализ Python и формирование данных
Я получаю с сервера данные в следующем виде:
dataProducer00 ==> Collected: 37402 Answer (MQ): 6234 Text/s: 12467 Text/s[3s]: 12467 lined: 0/75 liveData: n/a diff: n/a neardiff: n/a diffSeq: n/a
dataProducer01 ==> Collected: 45697 Answer (MQ): 7617 Text/s: 15232 Text/s[3s]: 15232 lined: 0/85 liveData: n/a diff: n/a neardiff: n/a diffSeq: n/a
dataProducer02 ==> Collected: 55936 Answer (MQ): 9326 Text/s: 18645 Text/s[3s]: 18645 lined: 0/121 liveData: n/a diff: n/a neardiff: n/a diffSeq: n/a
dataCleaner00[00] ==> Collected: 0 Answer: 0 Ratio: 0 Text/s: 0 Text/s[2s]: 0 lined: 0/0 liveData: 48042 diff: 0 neardiff: 0 diffSeq: 0
dataCleaner00[11] ==> Collected: 65214 Answer: 34567 Ratio: 0.53 Text/s: 17283 Text/s[2s]: 17283 lined: 7/15 liveData: 48042 diff: 0 neardiff: 0 diffSeq: 0\
dataCleaner00[23] ==> Collected: 2175 Answer (MQ): 543 Ratio: 0.25 Text/s: 271 Text/s[2s]: 271 lined: 0/15 liveData: 48042 diff: 0 neardiff: 0 diffSeq: 0\
MACH0_CA_CFE_01_A ==> Collected: 0 breaks: 0 lined: 0/0 obtained: 0/0
MACH0_CA_MEC_AR_01_A ==> Collected: 8248 breaks: 0 lined: 10/16 obtained: 1/4
MACH0_CA_MEC_AR_02_A ==> Collected: 648 breaks: 0 lined: 1/16 obtained: 1/3
MACH2_CA_MEC_ITC_01_A ==> Collected: 0 breaks: 0 lined: 0/0 obtained: 0/0
MACH2_CA_TAP_01_A ==> Collected: 0 breaks: 0 lined: 0/0 obtained: 0/0
MACH2_CA_TAP_AR_01_A ==> Collected: 0 breaks: 0 lined: 0/0 obtained: 0/0
MACH3_FI_A ==> Collected: 0 breaks: 0 lined: 0/0 obtained: 0/0
Reader_03_t01 ==> Collected: 114700 Drops: 0 lined: 0/832 ErrorDown: 0 Lined: 0,0,0,0,0,0
Reader_03_t02 ==> Collected: 434708 Drops: 0 lined: 0/49168 ErrorDown: 0 Lined: 0,0,0,0,0,0
И я хочу получить результат в таком виде:
{
"Collected": {
"dataProducer00" : 37402,
..
"dataCleaner00" : 0,
..
"MACH0_CA_CFE_01_A" : 0,
"MACH0_CA_MEC_AR_01_A" : 8248,
"MACH0_CA_MEC_AR_02_A" : 648,
.. },
"Answer (MQ)": {
"dataProducer00" : 6234,
"dataProducer01" : 7617,
"dataProducer02" : 9326,
"dataCleaner02" : 543
},
"Answer": {
"dataCleaner00": 0,
"dataCleaner01": 34567
},
"Text/s": {
"dataProducer00" : 12467,
"dataProducer01" : 15232,
..
"dataCleaner00" : 0,
"dataCleaner01" : 17283,
..
},
"lined": {
"dataProducer00" : 0,
"dataProducer01" : 0,
"dataProducer02" : 0,
.. },
"lined_Top": {
"dataProducer00" : 75,
"dataProducer01" : 85,
"dataProducer02" : 121,
"dataCleaner00" : 0,
"dataCleaner01" : 15,
"dataCleaner02" : 15,
"MACH0_CA_CFE_01_A" : 0,
"MACH0_CA_MEC_AR_01_A" : 16,
"MACH0_CA_MEC_AR_02_A" : 16,
"MACH2_CA_MEC_ITC_01_A": 0,
"MACH2_CA_TAP_01_A": 0,
"MACH2_CA_TAP_AR_01_A": 0,
"MACH3_FI_A": 0
},
"liveData": {
"dataProducer00" : "n/a",
..
"dataCleaner02" : 48042
},
"breaks": {
"MACH0_CA_MEC_AR_01_A": 0,
..
"MACH2_CA_TAP_AR_01_A": 0
},
"Ratio": {
"dataCleaner00": 0,
"dataCleaner01": 0.53,
"dataCleaner02": 0.25
},
"obtained": {
"MACH0_CA_MEC_AR_01_A": 0,
"MACH0_CA_MEC_AR_02_A": 1,
..},
"obtained_Top": {
"MACH0_CA_MEC_AR_01_A": 0,
"MACH0_CA_MEC_AR_02_A": 4,
"MACH2_CA_MEC_ITC_01_A": 3,
.. "MACH3_FI_A": 0
},
"diff": {
"dataProducer00": "n/a",
.., "dataCleaner02" : 0
},
"neardiff": {
"dataProducer00": "n/a",
.. "dataCleaner00" : 0,
},
"diffSeq": {
"dataProducer00": "n/a",
.. "dataCleaner02" : 0
}
}
Я хожу по кругу, перебирая варианты, и ничего не помогает. Результат нужно получать через генератор, чтобы мне было проще выполнить окончательную обработку.
Можно ли сделать это с помощью простого регулярного выражения или нескольких вызовов split, и существует ли общий способ обработки строк всех этих типов?
Вот частично рабочий код, после которого мне всё равно приходится дополнительно обрабатывать результат и снова менять формат:
# Parse every "<name> --> <fields>" or "<name> ==> <fields>" line of `data`
# into output[name] = {field: int(value), ...} and print the result.
#
# NOTE: field_regex only captures "<word>: <digits>" pairs. Field names that
# contain characters outside \w (e.g. "Answer (MQ)") and non-integer values
# (e.g. "n/a") are not captured, and for values such as "0/75" or "0.53" only
# the leading integer is kept.
lines = data.strip().split('\n')
output = {}
field_regex = r'(\w+)\s*:\s*(\d+)'
field_pattern = re.compile(field_regex)
# Accept both arrow styles: the sample data uses "==>", which the previous
# "-->"-only pattern never matched (leaving `output` empty).
line_pattern = re.compile(r'^(.*?) (?:-->|==>) (.*)')
for line in lines:
    match = line_pattern.match(line)
    if match:
        name = match.group(1)
        fields = match.group(2)
        output[name] = {
            field: int(value)
            for field, value in field_pattern.findall(fields)
        }
print(output)
Дополнительно мне нужно следующее:
- пропустить ErrorDown
- получить lined в виде "lined": { "dataProducer00" : 0, "dataProducer01" : 0, "dataProducer02" : 0, "Reader_03_t01" : [0,0,0,0,0], "Reader_03_t02" : [0,0,0,0,0] .. },
- разделить obtained на obtained и obtained_Top следующим образом: "obtained": { "MACH0_CA_MEC_AR_01_A": 0, "MACH0_CA_MEC_AR_02_A": 1, ..}, "obtained_Top": { "MACH0_CA_MEC_AR_01_A": 0, "MACH0_CA_MEC_AR_02_A": 4, "MACH2_CA_MEC_ITC_01_A": 3, .. "MACH3_FI_A": 0 },
1 ответ
Если вы готовы использовать pandas:
#pip install pandas
import pandas as pd
# Regex matched repeatedly against each input line; every match is one
# "<field>: <value>" pair. The "<name> ==>" prefix is optional, so only the
# first pair of a line carries `outer_key`.
# Every fragment is a raw string so that \s and \d reach the regex engine
# without relying on Python's handling of invalid string escapes.
pattern = (
    r"(?:(?P<outer_key>.+?)\s+==>\s+)?"  # source name, e.g. dataProducer00, dataCleaner00[11],..
    r"(?P<inner_key>.+?):"               # field name, e.g. Collected, Answer (MQ),..
    r"\s+(?P<values>[\dn/a.,]+)"         # raw value, e.g. 37402, 0/75, n/a,..
)
def _to_numeric_or_keep(values):
    """Return *values* converted by ``pd.to_numeric``, or unchanged on failure.

    Replaces the deprecated ``pd.to_numeric(..., errors="ignore")``: when any
    element cannot be parsed (e.g. "0/75"), the original strings are kept.
    """
    try:
        return pd.to_numeric(values)
    except (ValueError, TypeError):
        return values


# Build {field_name_lowercased: {source_name: value}} from "file.txt".
out = (
    # presumably "|" never occurs in the data, so each line lands in a
    # single column — verify against the real input
    pd.read_csv("file.txt", header=None, sep="|")
    .squeeze()                # single-column frame -> Series of lines
    .str.extractall(pattern)  # one row per "<field>: <value>" match
    .ffill()                  # carry the source name to the later matches
    # drop placeholder values and the unwanted ErrorDown field
    .query("values != 'n/a' and not inner_key.str.contains('ErrorDown')")
    .reset_index(drop=True)
    .pipe(
        # group by normalised field name ("Lined" and " lined" -> "lined"),
        # keeping the groups in order of first appearance
        lambda x: x.groupby(x["inner_key"].str.strip().str.lower(), sort=False)
        .apply(
            lambda g: dict(zip(g["outer_key"], _to_numeric_or_keep(g["values"])))
        )
        .to_dict()
    )
)
Вывод:
{
"collected": {
"dataProducer00": 37402,
"dataProducer01": 45697,
"dataProducer02": 55936,
"dataCleaner00[00]": 0,
"dataCleaner00[11]": 65214,
"dataCleaner00[23]": 2175,
"MACH0_CA_CFE_01_A": 0,
"MACH0_CA_MEC_AR_01_A": 8248,
"MACH0_CA_MEC_AR_02_A": 648,
"MACH2_CA_MEC_ITC_01_A": 0,
"MACH2_CA_TAP_01_A": 0,
"MACH2_CA_TAP_AR_01_A": 0,
"MACH3_FI_A": 0,
"Reader_03_t01": 114700,
"Reader_03_t02": 434708
},
"answer (mq)": {
"dataProducer00": 6234,
"dataProducer01": 7617,
"dataProducer02": 9326,
"dataCleaner00[23]": 543
},
"text/s": {
"dataProducer00": 12467,
"dataProducer01": 15232,
"dataProducer02": 18645,
"dataCleaner00[00]": 0,
"dataCleaner00[11]": 17283,
"dataCleaner00[23]": 271
},
"text/s[3s]": {
"dataProducer00": 12467,
"dataProducer01": 15232,
"dataProducer02": 18645
},
"lined": {
"dataProducer00": "0/75",
"dataProducer01": "0/85",
"dataProducer02": "0/121",
"dataCleaner00[00]": "0/0",
"dataCleaner00[11]": "7/15",
"dataCleaner00[23]": "0/15",
"MACH0_CA_CFE_01_A": "0/0",
"MACH0_CA_MEC_AR_01_A": "10/16",
"MACH0_CA_MEC_AR_02_A": "1/16",
"MACH2_CA_MEC_ITC_01_A": "0/0",
"MACH2_CA_TAP_01_A": "0/0",
"MACH2_CA_TAP_AR_01_A": "0/0",
"MACH3_FI_A": "0/0",
"Reader_03_t01": "0,0,0,0,0,0",
"Reader_03_t02": "0,0,0,0,0,0"
},
"answer": {
"dataCleaner00[00]": 0,
"dataCleaner00[11]": 34567
},
"ratio": {
"dataCleaner00[00]": 0.0,
"dataCleaner00[11]": 0.53,
"dataCleaner00[23]": 0.25
},
"text/s[2s]": {
"dataCleaner00[00]": 0,
"dataCleaner00[11]": 17283,
"dataCleaner00[23]": 271
},
"livedata": {
"dataCleaner00[00]": 48042,
"dataCleaner00[11]": 48042,
"dataCleaner00[23]": 48042
},
"diff": {
"dataCleaner00[00]": 0,
"dataCleaner00[11]": 0,
"dataCleaner00[23]": 0
},
"neardiff": {
"dataCleaner00[00]": 0,
"dataCleaner00[11]": 0,
"dataCleaner00[23]": 0
},
"diffseq": {
"dataCleaner00[00]": 0,
"dataCleaner00[11]": 0,
"dataCleaner00[23]": 0
},
"breaks": {
"MACH0_CA_CFE_01_A": 0,
"MACH0_CA_MEC_AR_01_A": 0,
"MACH0_CA_MEC_AR_02_A": 0,
"MACH2_CA_MEC_ITC_01_A": 0,
"MACH2_CA_TAP_01_A": 0,
"MACH2_CA_TAP_AR_01_A": 0,
"MACH3_FI_A": 0
},
"obtained": {
"MACH0_CA_CFE_01_A": "0/0",
"MACH0_CA_MEC_AR_01_A": "1/4",
"MACH0_CA_MEC_AR_02_A": "1/3",
"MACH2_CA_MEC_ITC_01_A": "0/0",
"MACH2_CA_TAP_01_A": "0/0",
"MACH2_CA_TAP_AR_01_A": "0/0",
"MACH3_FI_A": "0/0"
},
"drops": {
"Reader_03_t01": 0,
"Reader_03_t02": 0
}
}