2011-12-30 63 views
4

當使用prettify時,我的DOCTYPE被拆成了三行。我怎樣才能讓它保持在同一行上?(標題:Node.toprettyxml()在Python中向DOCTYPE添加換行符)

「破」 輸出:

<?xml version="1.0" encoding="utf-8"?> 
<!DOCTYPE smil 
    PUBLIC '-//W3C//DTD SMIL 2.0//EN' 
    'http://www.w3.org/2001/SMIL20/SMIL20.dtd'> 
<smil xmlns="http://www.w3.org/2001/SMIL20/Language"> 
    <head> 
    <meta base="rtmp://cp23636.edgefcs.net/ondemand"/> 
    </head> 
    <body> 
    <switch> 
     <video src="mp4:soundcheck/1/clay_aiken/02_sc_ca_sorry_256.mp4" system-bitrate="336000"/> 
     <video src="mp4:soundcheck/1/clay_aiken/02_sc_ca_sorry_512.mp4" system-bitrate="592000"/> 
     <video src="mp4:soundcheck/1/clay_aiken/02_sc_ca_sorry_768.mp4" system-bitrate="848000"/> 
     <video src="mp4:soundcheck/1/clay_aiken/02_sc_ca_sorry_1128.mp4" system-bitrate="1208000"/> 
    </switch> 
    </body> 
</smil> 

腳本:

import csv 
import sys 
import os.path 

from xml.etree import ElementTree 
from xml.etree.ElementTree import Element, SubElement, Comment, tostring 

from xml.dom import minidom 

def prettify(doctype, elem):
    """Serialize *elem* behind *doctype* and return it re-indented.

    The element is dumped as UTF-8, the DOCTYPE declaration is placed in
    front of it, the combined text is parsed again with minidom, and the
    resulting document is pretty-printed with a one-space indent and
    UTF-8 encoding.
    """
    serialized = ElementTree.tostring(elem, 'utf-8')
    document = minidom.parseString(doctype + serialized)
    pretty = document.toprettyxml(encoding='utf-8', indent=" ")
    return pretty

# DOCTYPE declaration that prettify() places in front of every document.
doctype = '<!DOCTYPE smil PUBLIC "-//W3C//DTD SMIL 2.0//EN" "http://www.w3.org/2001/SMIL20/SMIL20.dtd">'

# (file-name suffix, system-bitrate) pairs; one <video> element is emitted
# per pair.
video_data = ((256, 336000),
              (512, 592000),
              (768, 848000),
              (1128, 1208000))


# Reads the CSV named by the first command-line argument and writes one
# .smil file per row.  Each row must provide the "year", "id" and
# "file_root_name" columns.
# NOTE: the 'rU' (universal newlines) mode is kept from the original
# Python 2 script; Python 3.11 removed the 'U' flag.
with open(sys.argv[1], 'rU') as f:
    reader = csv.DictReader(f)
    for row in reader:
        root = Element('smil')
        root.set('xmlns', 'http://www.w3.org/2001/SMIL20/Language')
        head = SubElement(root, 'head')
        # Pass "base" as a real attribute instead of embedding it in the
        # tag name, so the serializer is responsible for quoting it.
        SubElement(head, 'meta',
                   {'base': 'rtmp://cp23636.edgefcs.net/ondemand'})
        body = SubElement(root, 'body')
        switch_tag = SubElement(body, 'switch')

        for suffix, bitrate in video_data:
            attrs = {
                'src': ("mp4:soundcheck/{year}/{id}/{file_root_name}_{suffix}.mp4"
                        .format(suffix=str(suffix), **row)),
                'system-bitrate': str(bitrate),
            }
            SubElement(switch_tag, 'video', attrs)

        # Output goes to "<year>-<id>/<file_root_name>.smil"; that
        # directory is not created here, so it must already exist.
        full_path = os.path.join(row['year'] + '-' + row['id'],
                                 row['file_root_name'] + '.smil')
        # The context manager closes every output file; the original
        # left each handle open.
        with open(full_path, 'w') as output:
            output.write(prettify(doctype, root))

回答

2

查看了你當前的腳本和你在這個主題上提出的其他問題後,我認爲你可以通過使用字符串操作構建你的smil文件來讓你的生活變得更簡單。

幾乎文件中的所有xml都是靜態的。您需要擔心正確處理的唯一數據是video標記的屬性值。爲此,標準庫中有一個方便的功能可以完全滿足您的需求:xml.sax.saxutils.quoteattr

因此,考慮到以上幾點,下面是一個應該更容易使用的腳本:

import sys, os, csv 
from xml.sax.saxutils import quoteattr 

# Static part of every document, up to and including the opening <switch>.
smil_header = '''\
<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE smil PUBLIC "-//W3C//DTD SMIL 2.0//EN" "http://www.w3.org/2001/SMIL20/SMIL20.dtd">
<smil xmlns="http://www.w3.org/2001/SMIL20/Language">
    <head>
    <meta base="rtmp://cp23636.edgefcs.net/ondemand"/>
    </head>
    <body>
    <switch>
'''
# One <video> line; both placeholders receive already-quoted attribute values.
smil_video = '''\
     <video src=%s system-bitrate=%s/>
'''
# Static closing part of every document.
smil_footer = '''\
    </switch>
    </body>
</smil>
'''

src_format = 'mp4:soundcheck/%(year)s/%(id)s/%(file_root_name)s_%(suffix)s.mp4'

# (file-name suffix, system-bitrate) pairs; one <video> line per pair.
video_data = (
    ('256', '336000'), ('512', '592000'),
    ('768', '848000'), ('1128', '1208000'),
    )


def build_smil(row):
    """Return the complete SMIL document text for one CSV row.

    *row* is a mapping that provides the ``year``, ``id`` and
    ``file_root_name`` keys.  It is not modified.
    """
    parts = [smil_header]
    for suffix, bitrate in video_data:
        # Substitute the row values first and quote afterwards, so that
        # special characters coming from the CSV data are escaped too.
        src = src_format % dict(row, suffix=suffix)
        parts.append(smil_video % (quoteattr(src), quoteattr(bitrate)))
    parts.append(smil_footer)
    return ''.join(parts)


def main(argv):
    """Write one .smil file per row of the CSV file named by ``argv[1]``.

    Files are written to ``<root>/<year>-<id>/<file_root_name>.smil``,
    where root is ``argv[2]`` when given and the current working
    directory otherwise.
    """
    root = argv[2] if len(argv) > 2 else os.getcwd()

    # 'rU' and the bytes-mode write below are kept from the original
    # Python 2 script.
    with open(argv[1], 'rU') as stream:
        for row in csv.DictReader(stream):
            directory = os.path.join(root, '%(year)s-%(id)s' % row)
            try:
                os.makedirs(directory)
            except OSError:
                # An already existing directory is fine; any other
                # failure (e.g. permissions) must not be hidden.
                if not os.path.isdir(directory):
                    raise
            path = os.path.join(directory, '%(file_root_name)s.smil' % row)
            print(':: writing file: %s' % path)
            # A separate name keeps the CSV handle from being rebound
            # while it is still being iterated.
            with open(path, 'wb') as output:
                output.write(build_smil(row))


if __name__ == '__main__':
    main(sys.argv)
2

我想你至少有三種選擇:

  1. 直接接受這些換行。它們也許不受歡迎、也不美觀,但完全合法。

  2. 添加一個用更好的替換壞DOCTYPE的kludge。也許是這樣的:

    import re 
    
    # Pretty-print first; the result still carries the DOCTYPE that
    # toprettyxml() spread over several lines.
    pretty_xml = prettify(doctype, elem)
    # re.DOTALL lets "." cross line breaks, so the match runs from "<!" to
    # "dtd'>".  ".*" is greedy: it extends to the last "dtd'>" in the text.
    m = re.search("(<!.*dtd'>)", pretty_xml, re.DOTALL)
    # NOTE(review): m is None when nothing matches and .group() would then
    # raise AttributeError -- confirm the input always contains the DOCTYPE.
    ugly_doctype = m.group()
    # Swap the multi-line declaration for the original one-line doctype.
    fixed_xml = pretty_xml.replace(ugly_doctype, doctype)
    
  3. 使用功能更豐富的XML包。我想到了lxml;它大部分與ElementTree兼容。通過使用lxml的tostring函數,您將不再需要prettify函數,而且DOCTYPE會按照您希望的方式出現。例如:

    from lxml import etree 
    
    # Single-line DOCTYPE declaration handed to lxml's tostring() below.
    doctype = '<!DOCTYPE smil PUBLIC "-//W3C//DTD SMIL 2.0//EN" "http://www.w3.org/2001/SMIL20/SMIL20.dtd">'
    
    # Sample document on one line, without any DOCTYPE or indentation.
    XML = '<smil xmlns="http://www.w3.org/2001/SMIL20/Language"><head><meta base="rtmp://cp23636.edgefcs.net/ondemand"/></head><body><switch><video src="mp4:soundcheck/1/clay_aiken/02_sc_ca_sorry_256.mp4" system-bitrate="336000"/><video src="mp4:soundcheck/1/clay_aiken/02_sc_ca_sorry_512.mp4" system-bitrate="592000"/><video src="mp4:soundcheck/1/clay_aiken/02_sc_ca_sorry_768.mp4" system-bitrate="848000"/><video src="mp4:soundcheck/1/clay_aiken/02_sc_ca_sorry_1128.mp4" system-bitrate="1208000"/></switch></body></smil>'
    
    # Parse the sample, then serialize it with the XML declaration, the
    # DOCTYPE and pretty-printing all requested from tostring() itself.
    # The print statement below is Python 2 syntax.
    elem = etree.fromstring(XML)
    print etree.tostring(elem, doctype=doctype, pretty_print=True,
            xml_declaration=True, encoding="utf-8")
    

    輸出:

    <?xml version='1.0' encoding='utf-8'?> 
    <!DOCTYPE smil PUBLIC "-//W3C//DTD SMIL 2.0//EN" "http://www.w3.org/2001/SMIL20/SMIL20.dtd"> 
    <smil xmlns="http://www.w3.org/2001/SMIL20/Language"> 
        <head> 
        <meta base="rtmp://cp23636.edgefcs.net/ondemand"/> 
        </head> 
        <body> 
        <switch> 
         <video src="mp4:soundcheck/1/clay_aiken/02_sc_ca_sorry_256.mp4" system-bitrate="336000"/> 
         <video src="mp4:soundcheck/1/clay_aiken/02_sc_ca_sorry_512.mp4" system-bitrate="592000"/> 
         <video src="mp4:soundcheck/1/clay_aiken/02_sc_ca_sorry_768.mp4" system-bitrate="848000"/> 
         <video src="mp4:soundcheck/1/clay_aiken/02_sc_ca_sorry_1128.mp4" system-bitrate="1208000"/> 
        </switch> 
        </body> 
    </smil> 
    
0

我認爲無法去除Node.toprettyxml在DOCTYPE中產生的換行符,至少沒有符合Python風格(Pythonic)的辦法。

插入這些多餘換行符的是DocumentType類的writexml方法(從minidom模塊的第1284行開始)。被插入的換行符字符串最初來自Node.toprettyxml方法,並經由Document類的writexml方法傳遞下去。同一個換行符字符串也會被傳遞給Node其他各個子類的writexml方法。因此,更改調用Node.toprettyxml時傳入的換行符字符串,會改變整個輸出XML中使用的換行符。

這個問題有各種取巧(hacky)的解決辦法:修改minidom模塊的本地副本、對DocumentType類的writexml方法打「猴子補丁」,或者對XML字符串進行後處理以刪除不需要的換行符。但是,這些方法都不吸引我。

對我而言,最好的辦法似乎是讓事情保持原樣。將DOCTYPE分成多行是否真的是一個嚴重問題?