- 論壇徽章:
- 0
|
RSS閱讀,訂閱網址在main.py的Producer中
多線程部分參考了http://python.jobbole.com/81690/ 的帖子
rss分析器- #!usr/bin/env python
- # -*- coding:UTF-8 -*-
-
- import re
- from lxml import etree
- from bs4 import BeautifulSoup as sp
- import requests
- import urllib2
- import StringIO
-
- import sys
- reload(sys)
- sys.setdefaultencoding("utf-8")
-
# User-Agent header sent with every request made by urlread().
headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}


def urlread(url, timeout=30):
    """Download *url* and return its body as a byte string.

    ``requests`` is tried first; its response is decoded as UTF-8 and
    re-encoded, so that path always returns UTF-8 bytes.  If it raises for
    any reason the download is retried with ``urllib2`` and the raw bytes
    read from the socket are returned unchanged.

    Args:
        url: Address of the document to fetch.
        timeout: Seconds to wait on the network before giving up.  Without
            a timeout a stalled server would block the caller forever.

    Returns:
        The response body as bytes.

    Raises:
        Whatever ``urllib2`` raises when the fallback download fails too.
    """
    try:
        reply = requests.get(url, headers=headers, timeout=timeout)
        reply.encoding = "utf-8"
        return reply.text.encode("utf-8")
    except Exception:
        # Best-effort fallback.  ``Exception`` (not a bare except) so that
        # KeyboardInterrupt / SystemExit still stop the program.
        request = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(request, timeout=timeout)
        try:
            # read() already yields bytes; encoding them again would force
            # an implicit decode that fails on non-ASCII data.
            return response.read()
        finally:
            response.close()
-
class Item(object):
    """One feed entry: title, link, publication date and a text summary.

    The summary is stored under the historical attribute name
    ``decription`` (sic), which existing callers may rely on;
    ``description`` is a read-only alias for it.
    """

    # Matches one markup tag.  DOTALL lets a tag that is broken across
    # several lines be removed as well.
    _TAG = re.compile(r"<.*?>", re.S)

    def __init__(self, title, link, date, description):
        """Store the stripped fields; *description* is cleaned by filter()."""
        self.title = title.strip()
        self.link = link.strip()
        self.pubDate = date.strip()
        self.decription = self.filter(description).strip()

    @property
    def description(self):
        """Correctly spelt alias of ``decription``."""
        return self.decription

    def filter(self, description):
        """Return *description* as a short single-line plain-text summary.

        Markup tags and line breaks are removed, ``&nbsp;`` entities become
        ordinary spaces, and anything beyond 240 characters is replaced by
        ``'...'``.
        """
        description = self._TAG.sub('', description)
        description = description.replace("\r", '').replace("\n", '')
        description = description.replace("&nbsp;", " ")
        if len(description) > 240:
            description = description[:240] + '...'
        return description

    def __str__(self):
        return "%s\n%s\n%s\n<%s>\n" % (
            self.title,
            self.link,
            self.decription,
            self.pubDate,
        )

    __repr__ = __str__
-
class BSParser(object):
    """Turn a feed document into :class:`Item` objects.

    The document is parsed with BeautifulSoup (imported as ``sp``) using its
    "xml" parser.  Entries are looked up under ``item`` or ``entry`` tags,
    and each field is looked up under a short list of alternative tag names.
    """

    def __init__(self, url):
        """Download *url* with ``urlread`` and parse the result."""
        xml = urlread(url)
        self.reset(xml)

    def reset(self, xml=None):
        """Replace the parsed document; ``None`` loads an empty placeholder."""
        if xml is None:
            xml = "<xml> </xml>"
        # Always name the parser so both branches build the same kind of tree.
        self.soup = sp(xml, "xml")

    def callback(self, method, obj, tags):
        """Call ``obj.<method>(tag)`` for each tag until a result is truthy.

        Args:
            method: Method name; it is lower-cased before the lookup.
            obj: Object the method is looked up on.
            tags: Candidate arguments, tried in order.

        Returns:
            The first truthy result, otherwise the last result obtained
            (``None`` when every call failed or *tags* is empty).
        """
        rst = None
        attr = method.lower()

        for tag in tags:
            try:
                rst = getattr(obj, attr)(tag)
            except Exception:
                # Best effort: a candidate that cannot be looked up is
                # skipped and the next one is tried.
                continue
            if rst:
                break
        return rst

    def _text(self, obj, tags):
        """Return the text of the first matching tag, or '' when none matches."""
        node = self.callback("FIND", obj, tags)
        if node is None:
            return ""
        return node.text

    def getfields(self, tags=("item", "entry")):
        """Return every entry element of the document (empty list if none)."""
        found = self.callback(method="FIND_ALL",
                              obj=self.soup,
                              tags=tags)
        return found or []

    def gettitle(self, obj, tags=("title",)):
        """Return the entry title, or '' when the tag is missing."""
        return self._text(obj, tags)

    def getlink(self, obj, tags=("link",)):
        """Return the entry link, or '' when it cannot be found.

        The tag text is used when present; otherwise the ``href`` attribute
        of the same tag is used.
        """
        node = self.callback("FIND", obj, tags)
        if node is None:
            return ""
        rst = node.text
        if not rst:
            rst = node.get("href")
        return rst or ""

    def getdate(self, obj, tags=("pubDate", "published")):
        """Return the publication date text, or '' when the tag is missing."""
        return self._text(obj, tags)

    def getdescription(self, obj, tags=("description", "content")):
        """Return the raw description text, or '' when the tag is missing."""
        return self._text(obj, tags)

    def run(self):
        """Yield one :class:`Item` per entry found in the document."""
        for item in self.getfields():
            title = self.gettitle(item)
            link = self.getlink(item)
            date = self.getdate(item)
            description = self.getdescription(item)
            yield Item(title, link, date, description)
-
def test(url="http://rss.cnbeta.com/rss"):
    """Manual smoke test: fetch *url* and print every parsed item.

    Args:
        url: Feed address handed to ``BSParser``.
    """
    # The parser class in this module is BSParser and it requires a URL.
    parser = BSParser(url)
    for item in parser.run():
        print(item)


if __name__ == "__main__":
    test()
復制代碼 多線程運行- #!usr/bin/env python
- #coding=utf8
-
- import threading
- #import time
- import rss
- import Queue
- #import requests
- import output
- from Tkinter import END
-
-
- import sys
- reload(sys)
- sys.setdefaultencoding( "utf-8" )
-
-
class Manual(object):
    """Result collector whose appends are serialised by a re-entrant lock.

    Attributes are ``results`` (the list of collected objects, in arrival
    order) and ``lock`` (the ``threading.RLock`` held while appending).
    """

    def __init__(self):
        self.results = []
        self.lock = threading.RLock()

    def put(self, results):
        """Append *results* to ``self.results`` while holding the lock."""
        # ``with`` guarantees the lock is released even if append raises.
        with self.lock:
            self.results.append(results)
-
class Consumer(threading.Thread):
    """Worker thread that parses feeds taken from a queue.

    Each value read from the queue is treated as a feed URL, except the
    string ``"quit"``, which ends the thread.
    """

    def __init__(self, queue, manual):
        """Remember the job *queue* and the *manual* result collector."""
        threading.Thread.__init__(self)
        self._queue = queue
        self.manual = manual

    def run(self):
        """Process feeds until the ``"quit"`` sentinel is received."""
        while True:
            feed = self._queue.get()
            if feed == "quit":
                break
            try:
                for item in rss.BSParser(feed).run():
                    self.manual.put(item)
            except Exception as exc:
                # One unreachable or malformed feed must not kill the worker,
                # or the feeds still waiting in the queue lose a consumer.
                sys.stderr.write("skipping feed %s: %s\n" % (feed, exc))
-
def Producer(feeds=None, workers=4):
    """Fetch several feeds in parallel and return everything collected.

    Args:
        feeds: Iterable of feed URLs.  ``None`` selects the built-in list.
        workers: Number of ``Consumer`` threads to start.

    Returns:
        The ``results`` list of the ``Manual`` collector, filled by the
        workers.
    """
    if feeds is None:
        feeds = (
            'http://rss.cnbeta.com/rss',
            "http://www.36kr.com/feed",
            # Disabled in the original list:
            # "http://www.ftchinese.com/rss/feed",
            "http://python.jobbole.com/feed",
            "http://feed.cnblogs.com/blog/sitecateogry/python/rss",
            "http://feed.yeeyan.org/select",
            "http://blog.sina.com.cn/rss/1286528122.xml",
        )

    queue = Queue.Queue()
    manual = Manual()

    worker_threads = build_worker_pool(queue, manual, workers)

    for feed in feeds:
        queue.put(feed)

    # One sentinel per worker, queued after all feeds, so each thread stops
    # once the real work is done.
    for _ in worker_threads:
        queue.put("quit")

    for worker in worker_threads:
        worker.join()

    return manual.results
-
def build_worker_pool(queue, manual, size):
    """Create and start *size* ``Consumer`` threads; return them as a list.

    Every thread is given the same *queue* and *manual* objects.
    """
    def spawn():
        # Build one worker and start it before the next one is created.
        thread = Consumer(queue, manual)
        thread.start()
        return thread

    return [spawn() for _unused in range(size)]
-
if __name__ == "__main__":
    # Collect every feed item first, then show their text forms in the
    # Tk output window.
    lines = list(map(str, Producer()))
    window = output.OutputBoard()
    window.insert(END, lines)
    window.mainloop()
復制代碼 TK輸出顯示界面- #!usr/bin/env python
- #coding=utf-8
-
- from Tkinter import *
- from ScrolledText import ScrolledText
-
class TextBoard(ScrolledText):
    """ScrolledText widget on which Ctrl+A selects the whole content."""

    def __init__(self, parent):
        ScrolledText.__init__(self, parent)
        # Keyboard shortcut, bound for both the lower- and upper-case key.
        for sequence in ("<Control-Key-a>", "<Control-Key-A>"):
            self.bind(sequence, self.selectText)

    def selectText(self, event):
        """Event handler: tag everything from '1.0' to END as selected."""
        self.tag_add(SEL, '1.0', END)
        return 'break'
-
class OutputBoard(object):
    """Top-level Tk window holding a single ``TextBoard``.

    Attributes not defined here are forwarded to the ``TextBoard`` stored
    in ``self.TBoard``.
    """

    def __init__(self):
        self.tk = Tk()

        self.TBoard = TextBoard(self.tk)
        # expand/fill make the text pane resize together with the window.
        self.TBoard.pack(expand=YES, fill=BOTH)

    def __getattr__(self, attr):
        """Forward unknown attribute lookups to the text board."""
        # __getattr__ only runs when normal lookup fails, so reaching here
        # with "TBoard" means the board does not exist yet.  Reading
        # self.TBoard below would then re-enter this method forever.
        if attr == "TBoard":
            raise AttributeError(attr)
        return getattr(self.TBoard, attr)

    def insert(self, pos, obj):
        """Render *obj* with ``ObjParser`` and insert the text at *pos*."""
        objparser = ObjParser()
        self.TBoard.insert(pos, objparser.run(obj))
-
class ObjParser(object):
    """Render nested lists, tuples and dicts as indented plain text.

    Text fragments are collected in ``self.stack`` and joined by ``run``.
    """

    # One indentation step (attribute name kept as originally spelt).
    intend = ' ' * 4

    # type(u"") is ``unicode`` on Python 2 and ``str`` on Python 3, so text
    # of either kind is passed through untouched.
    _TEXT_TYPES = (str, type(u""))

    def __init__(self):
        # Per-instance buffer: a class-level list would be shared by every
        # parser and leak earlier output into later runs.
        self.stack = []

    def _as_text(self, value):
        """Return *value* as text: '' for None, strings as-is, else str()."""
        if value is None:
            return ''
        if isinstance(value, self._TEXT_TYPES):
            return value
        return str(value)

    def line_intend(self, level=0):
        """Return the indentation for *level*.

        A newline is put in front unless nothing has been emitted yet.
        """
        prefix = "\n" if self.stack else ""
        return prefix + self.intend * level

    def parse(self, obj, level=0):
        """Append the text form of *obj* to the stack.

        Lists/tuples and dicts are handed to ``parse_list`` and
        ``parse_dict``; every other value is converted to text directly.
        """
        if isinstance(obj, (list, tuple)):
            self.parse_list(obj, level)
        elif isinstance(obj, dict):
            self.parse_dict(obj, level)
        else:
            self.stack.append(self._as_text(obj))

    def parse_list(self, obj, level):
        """Emit each element of the sequence *obj*.

        A nested list or tuple becomes one indented, tab-separated row.
        Any other element is parsed and followed by a newline.
        """
        level += 1
        for item in obj:
            if isinstance(item, (tuple, list)):
                # Convert cells first so non-string values cannot break join.
                row = "\t".join(self._as_text(cell) for cell in item)
                self.stack.append(self.line_intend(level) + row)
            else:
                self.parse(item, level)
                self.stack.append('\n')
        self.stack.append(self.line_intend(level - 1))

    def parse_dict(self, obj, level):
        """Emit every ``key:value,`` pair of *obj*, one per indented line."""
        level += 1
        # items() is required: iterating the dict itself yields keys only.
        for k, v in obj.items():
            self.stack.append(self.line_intend(level) + self._as_text(k) + ":")
            self.parse(v, level)
            self.stack.append(",")
        self.stack.append(self.line_intend(level - 1))

    def run(self, obj):
        """Return the text rendering of *obj*."""
        # Start from an empty buffer so an instance can be reused.
        self.stack = []
        self.parse(obj, 0)
        return ''.join(self.stack)
復制代碼 |
|