2016-05-13 84 views
6

我試圖將幾個csv文件中的值分組到XML文件(groups.xml)中的bin中。我有以下的代碼工作在一定程度上,但是沒有得到什麼,我期待:使用熊貓剪切分組值

import os, sys 
import glob 
import pandas as pd 
import xml.etree.cElementTree as ET 

def xml_parse(): 
    try: 
     os.chdir("path/to/files") 
     filename = [file1 for file1 in glob.glob("*.csv")] 
     filename = [i.split('.', 1)[0] for i in filename] 
     #filename = '\n'.join(filename) 
     os.chdir('..') 
     output = [] 
     doc = ET.parse("groups.xml").getroot() 
     for root_ele in doc.findall('Groups'): 
      tag_ele = root_ele.find('GroupID').text 
      for name in filename: 
       if name == tag_ele.lower(): 
        for root_ele1 in root_ele.findall('groupname'): 
         displayname = root_ele1.find('Name').text 
         minval = root_ele1.find('min').text 
         mininc = root_ele1.find('minInc').text 
         maxvalue = root_ele1.find('max') 
         maxinclusive = root_ele1.find('maxInc') 
         lists = [] 
         frame = pd.DataFrame() 
         fname = "path/to/files" + name + ".csv" 
         df = pd.read_csv(fname, index_col=None, header=None) 
         lists.append(df) 
         frame = pd.concat(lists) 
         if maxvalue is not None: 
          maxval = maxvalue.text 
          if maxinclusive is not None: 
           maxinc = maxinclusive.text 
           df['bin'] = pd.cut(frame[1], [float(minval),float(maxval)], right= maxinc, include_lowest= mininc) 
           out = str(pd.concat([df['bin'], frame[1]], axis=1)) 
           out = out.split("\n")[2:] 
           for a in out: 
            print a 
         else: 
          df['bin'] = pd.cut(frame[1], [float(minval)], include_lowest= mininc) 
          out = str(pd.concat([df['bin'], frame[1]], axis=1)) 
          out = out.split("\n")[2:] 
          for a in out: 
           print a 
      break 
    except AttributeError: 
     pass 

電流輸出:

1 NaN 10.18 
2 NaN 25.16 
3 NaN 44.48 
4 NaN 85.24 
5 NaN 36.71 
6 NaN 77.09 
7 NaN 81.88 
8 NaN 22.92 
9 NaN 44.31 
10 NaN 15.79 
1 [10, 18] 10.18 
2  NaN 25.16 
3  NaN 44.48 
4  NaN 85.24 
5  NaN 36.71 
6  NaN 77.09 
7  NaN 81.88 
8  NaN 22.92 
9  NaN 44.31 
10 [10, 18] 15.79 
1  NaN 10.18 
2 [18, 35] 25.16 
3  NaN 44.48 
4  NaN 85.24 
5  NaN 36.71 
6  NaN 77.09 
7  NaN 81.88 
8 [18, 35] 22.92 
9  NaN 44.31 
10  NaN 15.79 
1  NaN 10.18 
2  NaN 25.16 
3 [35, 50] 44.48 
4  NaN 85.24 
5 [35, 50] 36.71 
6  NaN 77.09 
7  NaN 81.88 
8  NaN 22.92 
9 [35, 50] 44.31 
10  NaN 15.79 
1 NaN 10.18 
2 NaN 25.16 
3 NaN 44.48 
4 NaN 85.24 
5 NaN 36.71 
6 NaN 77.09 
7 NaN 81.88 
8 NaN 22.92 
9 NaN 44.31 
10 NaN 15.79 
1 NaN 10.18 
2 NaN 25.16 
3 NaN 44.48 
4 NaN 85.24 
5 NaN 36.71 
6 NaN 77.09 
7 NaN 81.88 
8 NaN 22.92 
9 NaN 44.31 
10 NaN 15.79 

一個錯誤:

Traceback (most recent call last): 
    File "groups.py", line 69, in <module> 
    xml_parse() 
    File "groups.py", line 44, in xml_parse 
    df['bin'] = pd.cut(frame[1], [float(minval)], include_lowest= mininc) 
    File "C:\Python27\lib\site-packages\pandas\tools\tile.py", line 113, in cut 
    include_lowest=include_lowest) 
    File "C:\Python27\lib\site-packages\pandas\tools\tile.py", line 203, in _bins_to_cuts 
    include_lowest=include_lowest) 
    File "C:\Python27\lib\site-packages\pandas\tools\tile.py", line 252, in _format_levels 
    levels[0] = '[' + levels[0][1:] 
IndexError: list index out of range 

預期產出:

1 [10, 18] 10.18 
2 [18, 35] 25.16 
3 [35, 50] 44.48 
4 [>= 75] 85.24 #however >=75 can be represented 
5 [35, 50] 36.71 
6 [>= 75] 77.09 
7 [>= 75] 81.88 
8 [18, 35] 22.92 
9 [35, 50] 44.31 
10 [10, 18] 15.79 

回答

3

與開始:

df: 

    val1 val2 
0  NaN 10 
1 10.18  1 
2 25.16  1 
3 44.48  1 
4 85.24  1 
5 36.71  1 
6 77.09  1 
7 81.88  1 
8 22.92  1 
9 44.31  1 
10 15.79  1 

xml = """ 
<metaGroups> 
    <Groups> 
     <GroupID>age</GroupID> 
     <description>age</description> 
     <groupname> 
      <Name>0 - &lt;10</Name> 
      <min>0</min> 
      <minInc>TRUE</minInc> 
      <max>10</max> 
      <maxInc>FALSE</maxInc> 
     </groupname> 
     <groupname> 
      <Name>10 - &lt;18</Name> 
      <min>10</min> 
      <minInc>TRUE</minInc> 
      <max>18</max> 
      <maxInc>FALSE</maxInc> 
     </groupname> 
     <groupname> 
      <Name>18 - &lt;35</Name> 
      <min>18</min> 
      <minInc>TRUE</minInc> 
      <max>35</max> 
      <maxInc>FALSE</maxInc> 
     </groupname> 
     <groupname> 
      <Name>35 - &lt;50</Name> 
      <min>35</min> 
      <minInc>TRUE</minInc> 
      <max>50</max> 
      <maxInc>FALSE</maxInc> 
     </groupname> 
     <groupname> 
      <Name>50 - &lt;65</Name> 
      <min>50</min> 
      <minInc>TRUE</minInc> 
      <max>65</max> 
      <maxInc>FALSE</maxInc> 
     </groupname> 
     <groupname> 
      <Name>65 - &lt;75</Name> 
      <min>65</min> 
      <minInc>TRUE</minInc> 
      <max>75</max> 
      <maxInc>FALSE</maxInc> 
     </groupname> 
     <groupname> 
      <Name>&amp;ge;75</Name> 
      <min>75</min> 
      <minInc>TRUE</minInc> 
     </groupname> 
    </Groups> 
</metaGroups> 
""" 

您可以使用BeautifulSoup提取bin參數,構建標籤和應用pd.cut()

from bs4 import BeautifulSoup as Soup 
from itertools import chain 

soup = Soup(xml, 'html.parser') 

bins = [] 
for message in soup.findAll('groupname'): 
    min = message.find('min').text 
    try: 
     max = message.find('max').text 
     bins.append([min, max]) 
    except: 
     bins.append([min]) # For max bin 

在這一點上,我們

bins 

[['0', '10'], ['10', '18'], ['18', '35'], ['35', '50'], ['50', '65'], ['65', '75'], ['75']] 

接下來,我們將壓扁的listlist,擺脫重複的,添加的上限:

labels = bins 
bins = list(np.unique(np.fromiter(chain.from_iterable(bins), dtype='int'))) 
last = bins[-1] 
bins.append(int(df.val1.max() + 1)) 

這將產生:

[0, 10, 18, 35, 50, 65, 75, 86] 

構建標籤:

labels = ['[{0} - {1}]'.format(label[0], label[1]) if len(label) > 1 else '[ > {} ]'.format(label[0]) for label in labels] 

並使用pd.cut()

df['binned'] = pd.cut(df.val1, bins=bins, labels=labels) 

生產:

 val1 val2  binned 
1 10.18  1 [10 - 18] 
2 25.16  1 [18 - 35] 
3 44.48  1 [35 - 50] 
4 85.24  1 [>= 75] 
5 36.71  1 [35 - 50] 
6 77.09  1 [>= 75] 
7 81.88  1 [>= 75] 
8 22.92  1 [18 - 35] 
9 44.31  1 [35 - 50] 
10 15.79  1 [10 - 18] 
+0

謝謝你的答案。 '垃圾箱'對我來說沒有任何回報。 – pam

+0

我沒有包含'xml'字符串,請參閱更新的答案。 – Stefan

+0

我用我的xml文件。看起來我沒有正確閱讀。它的工作現在。萬分感謝! – pam