施磊磊的核心技术全自动抓取热门主题以及转推引用回复用户的最新微博.docx
- 文档编号:28769054
- 上传时间:2023-07-19
- 格式:DOCX
- 页数:18
- 大小:19.74KB
施磊磊的核心技术全自动抓取热门主题以及转推引用回复用户的最新微博.docx
《施磊磊的核心技术全自动抓取热门主题以及转推引用回复用户的最新微博.docx》由会员分享,可在线阅读,更多相关《施磊磊的核心技术全自动抓取热门主题以及转推引用回复用户的最新微博.docx(18页珍藏版)》请在冰豆网上搜索。
施磊磊的核心技术全自动抓取热门主题以及转推引用回复用户的最新微博
import os

import twitter


def oauth_login():
    """Build an OAuth-authenticated client for the Twitter API.

    Each credential is read from an environment variable when it is set
    (``TWITTER_CONSUMER_KEY``, ``TWITTER_CONSUMER_SECRET``,
    ``TWITTER_OAUTH_TOKEN``, ``TWITTER_OAUTH_TOKEN_SECRET``); otherwise the
    value embedded below is used, which keeps the original behavior.

    Returns:
        The ``twitter.Twitter`` object created from the credentials.
    """
    # Create an app on Twitter's developer site to obtain values for these
    # credentials. See Twitter's OAuth documentation for more information.
    #
    # NOTE(security): secrets should not live in source control. The
    # embedded values are kept only as a backward-compatible fallback;
    # rotate them and supply real credentials through the environment.
    CONSUMER_KEY = os.environ.get(
        'TWITTER_CONSUMER_KEY',
        'g5f16YxsEPQfCnpqMSAYbQ9yf')
    CONSUMER_SECRET = os.environ.get(
        'TWITTER_CONSUMER_SECRET',
        't8x1Nnw2FqMh4jvxt3AwkoyoXNMLcTytc9ZVdYuEmT5VQtbJlt')
    OAUTH_TOKEN = os.environ.get(
        'TWITTER_OAUTH_TOKEN',
        '3538463899-iYmdEPgxptG6gljAtpKnKDPEwFS8ZlbM9SkHE91')
    OAUTH_TOKEN_SECRET = os.environ.get(
        'TWITTER_OAUTH_TOKEN_SECRET',
        'ePF0XgybJVViarrsOai7j0Pce5mfYYfNu26ZH7L0oUBrb')

    auth = twitter.oauth.OAuth(OAUTH_TOKEN, OAUTH_TOKEN_SECRET,
                               CONSUMER_KEY, CONSUMER_SECRET)
    twitter_api = twitter.Twitter(auth=auth)
    return twitter_api
# Sample usage: log in once at module level so later code can share the
# resulting client object.
twitter_api = oauth_login()

# Nothing to see by displaying twitter_api except that it's now a
# defined variable. The parenthesized form prints the same text on
# Python 2 and Python 3.
print(twitter_api)
import json


def twitter_trends(twitter_api, woe_id):
    """Return the trends for one place.

    Args:
        twitter_api: Client object exposing ``trends.place``.
        woe_id: Identifier of the place to query (presumably a Yahoo!
            Where On Earth ID -- confirm against the API documentation).

    Returns:
        Whatever ``twitter_api.trends.place`` returns for that place.
    """
    # Prefix ID with the underscore for query string parameterization.
    # Without the underscore, the twitter package appends the ID value
    # to the URL itself as a special-case keyword argument.
    return twitter_api.trends.place(_id=woe_id)
def twitter_search(twitter_api, q, max_results=2000000, **kw):
    """Search for tweets matching ``q``, following result pages.

    Args:
        twitter_api: Client object exposing ``search.tweets``.
        q: The search query.
        max_results: Upper bound on the number of statuses returned. It is
            clamped to the range 0..1000 regardless of the value passed.
        **kw: Extra keyword arguments forwarded to the first
            ``search.tweets`` call (advanced search criteria). ``count``
            defaults to 100 when not supplied.

    Returns:
        A list of at most ``max_results`` statuses.
    """
    # Imported locally so the function works on both Python 3 and Python 2
    # without touching the file-level imports.
    try:
        from urllib.parse import parse_qsl
    except ImportError:
        from urlparse import parse_qsl

    # Enforce a reasonable limit. A reasonable number of results is ~1000,
    # although that number of results may not exist for all queries.
    max_results = max(0, min(1000, max_results))

    # 100 is the page size the paging budget below assumes (10 * 100); the
    # API documentation gives it as the maximum per request.
    kw.setdefault('count', 100)
    search_results = twitter_api.search.tweets(q=q, **kw)

    # Copy so the caller-visible response object is not mutated below.
    statuses = list(search_results['statuses'])

    # Iterate through batches of results by following the cursor until we
    # reach the desired number of results, keeping in mind that OAuth users
    # can "only" make 180 search queries per 15-minute interval.
    for _ in range(10):  # 10 * 100 = 1000
        if len(statuses) >= max_results:
            break
        try:
            next_results = search_results['search_metadata']['next_results']
        except KeyError:
            # No more results when next_results doesn't exist
            break

        # next_results is a query string of the following form:
        # ?max_id=313519052523986943&q=NCAA&include_entities=1
        # parse_qsl decodes percent-escapes, so values such as q=%23NCAA
        # are handed back as "#NCAA" rather than still-encoded text.
        kwargs = dict(parse_qsl(next_results[1:], keep_blank_values=True))

        search_results = twitter_api.search.tweets(**kwargs)
        statuses += search_results['statuses']

    return statuses[:max_results]
def extract_tweet_entities(statuses):
    """Collect the entities mentioned across a list of statuses.

    Args:
        statuses: Sequence of status dicts, each with an ``'entities'``
            dict holding ``'user_mentions'``, ``'hashtags'``, ``'urls'``,
            ``'symbols'`` and, optionally, ``'media'`` lists.

    Returns:
        A 5-tuple of lists, in this order:
        ``(screen_names, hashtags, urls, media, symbols)``.
    """
    if len(statuses) == 0:
        return [], [], [], [], []

    screen_names = [user_mention['screen_name']
                    for status in statuses
                    for user_mention in status['entities']['user_mentions']]

    hashtags = [hashtag['text']
                for status in statuses
                for hashtag in status['entities']['hashtags']]

    urls = [url['expanded_url']
            for status in statuses
            for url in status['entities']['urls']]

    symbols = [symbol['text']
               for status in statuses
               for symbol in status['entities']['symbols']]

    # In some circumstances (such as search results), the media entity
    # may not appear. Look it up per status so that a mix of statuses
    # with and without media is handled, instead of testing only the
    # loop variable left over from a previous comprehension.
    media = [item['url']
             for status in statuses
             for item in status['entities'].get('media', [])]

    return screen_names, hashtags, urls, media, symbols
def find_popular_tweets(twitter_api, statuses, retweet_threshold=3):
    """Return the statuses retweeted more than ``retweet_threshold`` times.

    Args:
        twitter_api: Accepted for signature compatibility; not used here.
        statuses: Iterable of status dicts with a ``'retweet_count'`` key.
        retweet_threshold: Exclusive lower bound on the retweet count.

    Returns:
        A list of the qualifying statuses, in their original order.
    """
    # You could also consider using the favorite_count parameter as part of
    # this heuristic, possibly using it to provide an additional boost to
    # popular tweets in a ranked formulation.
    return [status
            for status in statuses
            if status['retweet_count'] > retweet_threshold]
from collections import Counter


def get_common_tweet_entities(statuses, entity_threshold=3):
    """Return the tweet entities that occur at least a given number of times.

    Args:
        statuses: Iterable of status dicts accepted by
            ``extract_tweet_entities``.
        entity_threshold: Minimum number of occurrences (inclusive) for an
            entity to be reported.

    Returns:
        A list of ``(entity, count)`` tuples in the order produced by
        ``Counter.most_common()``.
    """
    # Create a flat list of all tweet entities, regardless of their type.
    tweet_entities = [
        e
        for status in statuses
        for entity_type in extract_tweet_entities([status])
        for e in entity_type
    ]

    # Compute frequencies
    c = Counter(tweet_entities).most_common()

    return [(k, v) for (k, v) in c if v >= entity_threshold]
import re

# Regex adapted from Stack Overflow (http://bit.ly/1821y0J).
# Compiled once at import time rather than on every call.
_RT_PATTERNS = re.compile(r"(RT|via)((?:\b\W*@\w+)+)", re.IGNORECASE)


def get_rt_attributions(tweet):
    """Return the screen names a tweet attributes its content to.

    Args:
        tweet: Status dict with a ``'text'`` key and, for retweets made
            through the retweet endpoint, a ``'retweeted_status'`` dict
            containing ``['user']['screen_name']``.

    Returns:
        A de-duplicated list of lower-cased screen names without the
        leading ``@``. The order of the list is not defined.
    """
    rt_attributions = []

    # Inspect the tweet to see if it was produced with
    # /statuses/retweet/:id.
    if 'retweeted_status' in tweet:
        attribution = tweet['retweeted_status']['user']['screen_name'].lower()
        rt_attributions.append(attribution)

    # Also, inspect the tweet for the presence of "legacy" retweet patterns
    # such as "RT" and "via", which are still widely used for various
    # reasons and potentially very useful. Only the first match is used.
    matches = _RT_PATTERNS.findall(tweet['text'])
    if matches:
        rt_attributions += [mention.strip()
                            for mention in matches[0][1].split()]

    # Filter out any duplicates
    return list(set(rta.strip("@").lower() for rta in rt_attributions))
import pymongo  # pip install pymongo


def save_to_mongo(data, mongo_db, mongo_db_coll, **mongo_conn_kw):
    """Insert ``data`` into a MongoDB collection and return the new IDs.

    Args:
        data: A single document (dict) or a list of documents.
        mongo_db: Name of the database.
        mongo_db_coll: Name of the collection inside that database.
        **mongo_conn_kw: Keyword arguments passed to
            ``pymongo.MongoClient``.

    Returns:
        The inserted ID for a single document, or the list of inserted IDs
        for a list of documents.
    """
    # Connects to the MongoDB server running on
    # localhost:27017 by default
    client = pymongo.MongoClient(**mongo_conn_kw)

    # Get a reference to a particular database
    db = client[mongo_db]

    # Reference a particular collection in the database
    coll = db[mongo_db_coll]

    # Collection.insert() is gone from current PyMongo releases. Check the
    # class rather than the instance, because attribute access on a
    # collection instance resolves unknown names to sub-collections.
    if not callable(getattr(type(coll), 'insert_many', None)):
        # Older PyMongo: keep the original bulk insert call.
        return coll.insert(data)

    if isinstance(data, dict):
        return coll.insert_one(data).inserted_id
    return coll.insert_many(data).inserted_ids
def load_from_mongo(mongo_db, mongo_db_coll, return_cursor=False,
                    criteria=None, projection=None, **mongo_conn_kw):
    """Query a MongoDB collection.

    Args:
        mongo_db: Name of the database.
        mongo_db_coll: Name of the collection inside that database.
        return_cursor: When true, return the cursor instead of a list.
        criteria: Query document passed to ``find``; ``None`` means ``{}``.
        projection: Optional projection passed to ``find``.
        **mongo_conn_kw: Keyword arguments passed to
            ``pymongo.MongoClient``.

    Returns:
        The cursor returned by ``find`` when ``return_cursor`` is true,
        otherwise a list of the items it yields.
    """
    # Optionally, use criteria and projection to limit the data that is
    # returned as documented in
    # http://docs.mongodb.org/manual/reference/method/db.collection.find/

    # Consider leveraging MongoDB's aggregations framework for more
    # sophisticated queries.
    client = pymongo.MongoClient(**mongo_conn_kw)
    db = client[mongo_db]
    coll = db[mongo_db_coll]

    if criteria is None:
        criteria = {}

    if projection is None:
        cursor = coll.find(criteria)
    else:
        cursor = coll.find(criteria, projection)

    # Returning a cursor is recommended for large amounts of data
    if return_cursor:
        return cursor
    return list(cursor)
# save_to_mongo(results, 'search_results005', q)
## load_from_mongo('search_results005', q)
import sys
import time

try:  # Python 3 module names
    from urllib.error import URLError
    from http.client import BadStatusLine
except ImportError:  # Python 2 module names
    from urllib2 import URLError
    from httplib import BadStatusLine


def make_twitter_request(twitter_api_func, max_errors=10, *args, **kw):
    """Call a Twitter API function, retrying on recoverable failures.

    Args:
        twitter_api_func: The callable to invoke.
        max_errors: Number of consecutive URLError/BadStatusLine failures
            tolerated before the last one is re-raised. Because it comes
            before ``*args``, the first extra positional argument is bound
            to it.
        *args: Positional arguments forwarded to ``twitter_api_func``.
        **kw: Keyword arguments forwarded to ``twitter_api_func``.

    Returns:
        The return value of ``twitter_api_func``, or ``None`` after a 401
        or 404 response.

    Raises:
        twitter.api.TwitterHTTPError: For unhandled status codes, or once
            the back-off wait exceeds 3600 seconds.
        URLError, BadStatusLine: After more than ``max_errors``
            consecutive failures of that kind.
    """

    # A nested helper function that handles common HTTPErrors. Returns an
    # updated value for wait_period if the problem is a 500 level error.
    # Blocks until the rate limit is reset if it's a rate limiting issue
    # (429 error). Returns None for 401 and 404 errors, which requires
    # special handling by the caller.
    def handle_twitter_http_error(e, wait_period=2,
                                  sleep_when_rate_limited=True):
        if wait_period > 3600:  # Seconds
            sys.stderr.write('Too many retries. Quitting.\n')
            raise e

        status = e.e.code
        if status == 401:
            sys.stderr.write('Encountered 401 Error (Not Authorized)\n')
            return None
        if status == 404:
            sys.stderr.write('Encountered 404 Error (Not Found)\n')
            return None
        if status == 429:
            sys.stderr.write('Encountered 429 Error (Rate Limit Exceeded)\n')
            if not sleep_when_rate_limited:
                raise e  # Caller must handle the rate limiting issue
            sys.stderr.write('Retrying in 15 minutes...ZzZ...\n')
            sys.stderr.flush()
            time.sleep(60 * 15 + 5)
            sys.stderr.write('...ZzZ...Awake now and trying again.\n')
            return 2  # Start over with the initial back-off
        if status in (500, 502, 503, 504):
            sys.stderr.write(
                'Encountered %i Error. Retrying in %i seconds\n'
                % (status, wait_period))
            time.sleep(wait_period)
            return wait_period * 1.5
        raise e

    # End of nested helper function

    wait_period = 2
    error_count = 0

    while True:
        try:
            return twitter_api_func(*args, **kw)
        except twitter.api.TwitterHTTPError as e:
            # An HTTP-level answer means the connection itself worked, so
            # the consecutive transport-error counter starts over.
            error_count = 0
            wait_period = handle_twitter_http_error(e, wait_period)
            if wait_period is None:
                return None
        except (URLError, BadStatusLine) as e:
            error_count += 1
            sys.stderr.write(
                '%s encountered. Continuing.\n' % type(e).__name__)
            if error_count > max_errors:
                sys.stderr.write(
                    'Too many consecutive errors...bailing out.\n')
                raise
defharvest_user_timeline(twitter_api,screen_name=None,user_id=None,max_results=1000):
assert(screen_name!
=None)!
=(user_id!
=None),\
"Musthavescreen_nameoruser_id,butnotboth"
kw={#KeywordargsfortheTwitterAPIcall
'count':
200,
'trim_user':
'true',
'include_rts':
'true',
'since_id':
1
}
ifscreen_name:
kw['screen_name']=screen_name
else:
kw['user_id']=user_id
max_pages=10
results=[]
tweets=make_twitter_request(twitter_api.statuses.user_timeline,**kw)
iftweetsisNone:
#401(NotAuthorized)-Needtobailoutonloopentry
tweets=[]
results+=tweets
print>>sys.stderr,'Fetched%itweets'%len(tweets)
page_num=1
#ManyTwitteraccountshavefewerthan200tweetssoyoudon'twanttoenter
#theloopandwasteapreciousrequestifmax_re
- 配套讲稿:
如PPT文件的首页显示word图标,表示该PPT已包含配套word讲稿。双击word图标可打开word文档。
- 特殊限制:
部分文档作品中含有的国旗、国徽等图片,仅作为作品整体效果示例展示,禁止商用。设计者仅对作品中独创性部分享有著作权。
- 关 键 词:
- 磊磊 核心技术 全自动 抓取 热门 主题 以及 引用 回复 用户 最新