2017-04-12 64 views
0

我使用 Solr 和 Lily 對存儲在 HBase 中的 Twitter 數據建立索引,並基於關鍵字(如「hadoop」、「bigdata」、「計算機科學」等)生成搜索結果。

Lily Solr HBase Indexer:添加索引器 indexdemo-indexer.xml

HBase 中的一行數據如下:

838720557562609665:1488801538:180782707: column=json:tweetJSON, timestamp=1488801607097, value={"created_at":"Mon Mar 06 11:58:58 +0000 2017","id":838720557562609665,"i 
              d_str":"838720557562609665","text":"RT @eraser: #Blockchain Technology Breakdown [img] by @FollowMyVote #fintech #BigData #IoT 
              #insurtech #cryptocurrency\x5Cu2026 ","source":"\x5Cu003ca href=\x5C"https:\x5C/\x5C/about.twitter.com\x5C/products\x5C/tweetd 
              eck\x5C" rel=\x5C"nofollow\x5C"\x5Cu003eTweetDeck\x5Cu003c\x5C/a\x5Cu003e","truncated":false,"in_reply_to_status_id":null,"in_r 
              eply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"i 
              d":180782707,"id_str":"180782707","name":"bitiji","screen_name":"bitiji","location":"Zevilla ciberespaci\x5Cu00e1","url":"http: 
              \x5C/\x5C/bitiji.com","description":"Nac\x5Cu00ed, crec\x5Cu00ed, me viraliz\x5Cu00e9 y mor\x5Cu00ed... y vuerta a empez\x5Cu0 
              0e1. hacia el infinito y + all\x5Cu00e1 .\x5Cr\x5CnMatria del eco(NO)sistema @bitiji","protected":false,"verified":false,"foll 
              owers_count":964,"friends_count":700,"listed_count":157,"favourites_count":124,"statuses_count":17870,"created_at":"Fri Aug 20 
              13:31:49 +0000 2010","utc_offset":3600,"time_zone":"Madrid","geo_enabled":true,"lang":"es","contributors_enabled":false,"is_tra 
              nslator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\x5C/\x5C/pbs.twimg.com\x5C/profile_bac 
              kground_images\x5C/162532234\x5C/bitiji_avatartwitter.png","profile_background_image_url_https":"https:\x5C/\x5C/pbs.twimg.com\ 
              x5C/profile_background_images\x5C/162532234\x5C/bitiji_avatartwitter.png","profile_background_tile":true,"profile_link_color":" 
              0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_us 
              e_background_image":true,"profile_image_url":"http:\x5C/\x5C/pbs.twimg.com\x5C/profile_images\x5C/2859966744\x5C/2f056cd86881e4 
              91f42c4bd942f5c5be_normal.png","profile_image_url_https":"https:\x5C/\x5C/pbs.twimg.com\x5C/profile_images\x5C/2859966744\x5C/2 
              f056cd86881e491f42c4bd942f5c5be_normal.png","profile_banner_url":"https:\x5C/\x5C/pbs.twimg.com\x5C/profile_banners\x5C/1807827 
              07\x5C/1398242563","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notificat 
              ions":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Wed Mar 01 13:35:0 
              5 +0000 2017","id":836932805879820288,"id_str":"836932805879820288","text":"#Blockchain Technology Breakdown [img] by @FollowMy 
              Vote #fintech #BigData #IoT #insurtech #cryptocurrency\x5Cu2026 https:\x5C/\x5C/t.co\x5C/KuYmu4lh8A","display_text_range":[0,1 
              40],"source":"\x5Cu003ca href=\x5C"http:\x5C/\x5C/www.hootsuite.com\x5C" rel=\x5C"nofollow\x5C"\x5Cu003eHootsuite\x5Cu003c\x5C/ 
              a\x5Cu003e","truncated":true,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply 
              _to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":3122211,"id_str":"3122211","name":"eraser ju\x5Cu24b6njo * \x 
              5Cu2718 \x5Cu2605","screen_name":"eraser","location":"Sevilla","url":"http:\x5C/\x5C/e-learning-teleformacion.blogspot.com","de 
              scription":"PhD student @fceyeUS @unisevilla elige la clave dela vida abcchdefghij... \x5Cu2718\x5Cu24d4-\x5Cu24dd\x5Cu24d0\x5C 
              u24e4\x5Cu24e3\x5Cu24d0, \x5Cu24d4-\x5Cu24dc\x5Cu24d4\x5Cu24dd\x5Cu24e3\x5Cu24d4 Sevilla \x5Cu2605 elearning \x5Cu2605\x5Cu24b6 
              r\x5Cu24e3\x5Cu2605 education \x5Cu2605 P2P \x5Cu2605 blockchain \x5Cu2605 economy","protected":false,"verified":false,"followe 
              rs_count":21208,"friends_count":11566,"listed_count":2074,"favourites_count":4946,"statuses_count":474839,"created_at":"Sun Apr 
              01 12:12:45 +0000 2007","utc_offset":3600,"time_zone":"Madrid","geo_enabled":true,"lang":"en","contributors_enabled":false,"is 
              _translator":false,"profile_background_color":"9AE4E8","profile_background_image_url":"http:\x5C/\x5C/pbs.twimg.com\x5C/profile 
              _background_images\x5C/880489560\x5C/e145d4701fc8ad1b84d114cc2fd7c996.jpeg","profile_background_image_url_https":"https:\x5C/\x 
              5C/pbs.twimg.com\x5C/profile_background_images\x5C/880489560\x5C/e145d4701fc8ad1b84d114cc2fd7c996.jpeg","profile_background_til 
              e":true,"profile_link_color":"0000FF","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"E0FF92","profile_te 
              xt_color":"000000","profile_use_background_image":true,"profile_image_url":"http:\x5C/\x5C/pbs.twimg.com\x5C/profile_images\x5C 
              /599157674337509376\x5C/0ZRJcLhV_normal.jpg","profile_image_url_https":"https:\x5C/\x5C/pbs.twimg.com\x5C/profile_images\x5C/59 
              9157674337509376\x5C/0ZRJcLhV_normal.jpg","profile_banner_url":"https:\x5C/\x5C/pbs.twimg.com\x5C/profile_banners\x5C/3122211\x 
              5C/1438841267","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications 
              ":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"is_quote_status":false,"extended_tweet":{"full_text":"# 
              Blockchain Technology Breakdown [img] by @FollowMyVote #fintech #BigData #IoT #insurtech #cryptocurrency #smartcities #DeepLea 
              rning https:\x5C/\x5C/t.co\x5C/PpbSfk3Dta","display_text_range":[0,133],"entities":{"hashtags":[{"text":"Blockchain","indices": 
              [0,11]},{"text":"fintech","indices":[57,65]},{"text":"BigData","indices":[66,74]},{"text":"IoT","indices":[75,79]},{"text":"ins 
              urtech","indices":[80,90]},{"text":"cryptocurrency","indices":[91,106]},{"text":"smartcities","indices":[107,119]},{"text":"Dee 
              pLearning","indices":[120,133]}],"urls":[],"user_mentions":[{"screen_name":"FollowMyVote","name":"FollowMyVote","id":392924202, 
              "id_str":"392924202","indices":[42,55]}],"symbols":[],"media":[{"id":836932802947973120,"id_str":"836932802947973120","indices" 
              :[134,157],"media_url":"http:\x5C/\x5C/pbs.twimg.com\x5C/media\x5C/C51h8zSWAAAb-dk.png","media_url_https":"https:\x5C/\x5C/pbs. 
              twimg.com\x5C/media\x5C/C51h8zSWAAAb-dk.png","url":"https:\x5C/\x5C/t.co\x5C/PpbSfk3Dta","display_url":"pic.twitter.com\x5C/Ppb 
              Sfk3Dta","expanded_url":"https:\x5C/\x5C/twitter.com\x5C/eraser\x5C/status\x5C/836932805879820288\x5C/photo\x5C/1","type":"phot 
              o","sizes":{"large":{"w":800,"h":2000,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"medium":{"w":480,"h":1200,"res 
              ize":"fit"},"small":{"w":272,"h":680,"resize":"fit"}}}]},"extended_entities":{"media":[{"id":836932802947973120,"id_str":"83693 
              2802947973120","indices":[134,157],"media_url":"http:\x5C/\x5C/pbs.twimg.com\x5C/media\x5C/C51h8zSWAAAb-dk.png","media_url_http 
              s":"https:\x5C/\x5C/pbs.twimg.com\x5C/media\x5C/C51h8zSWAAAb-dk.png","url":"https:\x5C/\x5C/t.co\x5C/PpbSfk3Dta","display_url": 
              "pic.twitter.com\x5C/PpbSfk3Dta","expanded_url":"https:\x5C/\x5C/twitter.com\x5C/eraser\x5C/status\x5C/836932805879820288\x5C/p 
              hoto\x5C/1","type":"photo","sizes":{"large":{"w":800,"h":2000,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"medium 
              ":{"w":480,"h":1200,"resize":"fit"},"small":{"w":272,"h":680,"resize":"fit"}}}]}},"retweet_count":15,"favorite_count":5,"entiti 
              es":{"hashtags":[{"text":"Blockchain","indices":[0,11]},{"text":"fintech","indices":[57,65]},{"text":"BigData","indices":[66,74 
              ]},{"text":"IoT","indices":[75,79]},{"text":"insurtech","indices":[80,90]},{"text":"cryptocurrency","indices":[91,106]}],"urls" 
              :[{"url":"https:\x5C/\x5C/t.co\x5C/KuYmu4lh8A","expanded_url":"https:\x5C/\x5C/twitter.com\x5C/i\x5C/web\x5C/status\x5C/8369328 
              05879820288","display_url":"twitter.com\x5C/i\x5C/web\x5C/status\x5C/8\x5Cu2026","indices":[108,131]}],"user_mentions":[{"scree 
              n_name":"FollowMyVote","name":"FollowMyVote","id":392924202,"id_str":"392924202","indices":[42,55]}],"symbols":[]},"favorited": 
              false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en"},"is_quote_status":false,"retweet_count":0, 
              "favorite_count":0,"entities":{"hashtags":[{"text":"Blockchain","indices":[12,23]},{"text":"fintech","indices":[69,77]},{"text" 
              :"BigData","indices":[78,86]},{"text":"IoT","indices":[87,91]},{"text":"insurtech","indices":[92,102]},{"text":"cryptocurrency" 
              ,"indices":[103,118]}],"urls":[{"url":"","expanded_url":null,"indices":[120,120]}],"user_mentions":[{"screen_name":"eraser","na 
              me":"eraser ju\x5Cu24b6njo * \x5Cu2718 \x5Cu2605","id":3122211,"id_str":"3122211","indices":[3,10]},{"screen_name":"FollowMyVot 
              e","name":"FollowMyVote","id":392924202,"id_str":"392924202","indices":[54,67]}],"symbols":[]},"favorited":false,"retweeted":fa 
              lse,"filter_level":"low","lang":"en","timestamp_ms":"1488801538254"}\x0D\x0A             
838720557562609665:1488801538:180782707: column=tweetdata:coordinates, timestamp=1488801607097, value=NA                 
838720557562609665:1488801538:180782707: column=tweetdata:created_at, timestamp=1488801607097, value=1488801538               
838720557562609665:1488801538:180782707: column=tweetdata:created_time_lucene, timestamp=1488801607097, value=2017-03-06T11:58:58Z          
838720557562609665:1488801538:180782707: column=tweetdata:hashtags, timestamp=1488801607097, value=Blockchain, fintech, BigData, IoT, insurtech, cryptocurrency   
838720557562609665:1488801538:180782707: column=tweetdata:id, timestamp=1488801607097, value=838720557562609665               
838720557562609665:1488801538:180782707: column=tweetdata:in_reply_to_screen_name, timestamp=1488801607097, value=NA              
838720557562609665:1488801538:180782707: column=tweetdata:in_reply_to_status_id, timestamp=1488801607097, value=NA              
838720557562609665:1488801538:180782707: column=tweetdata:in_reply_to_user_id, timestamp=1488801607097, value=NA               
838720557562609665:1488801538:180782707: column=tweetdata:place, timestamp=1488801607097, value=NA                  
838720557562609665:1488801538:180782707: column=tweetdata:retweeted_status_id, timestamp=1488801607097, value=836932805879820288           
838720557562609665:1488801538:180782707: column=tweetdata:retweeted_status_text, timestamp=1488801607097, value=#Blockchain Technology Breakdown [img] by @FollowMyVote 
              #fintech #BigData #IoT #insurtech #cryptocurrency\xE2\x80\xA6          
838720557562609665:1488801538:180782707: column=tweetdata:retweeted_status_user_id, timestamp=1488801607097, value=3122211            
838720557562609665:1488801538:180782707: column=tweetdata:retweeted_status_user_name, timestamp=1488801607097, value=eraser ju\xE2\x92\xB6njo * \xE2\x9C\x98 \xE2\x98\x8 
              5                                
838720557562609665:1488801538:180782707: column=tweetdata:source, timestamp=1488801607097, value=<a href="https://about.twitter.com/products/tweetdeck" rel="nofollow">T 
              weetDeck</a>                             
838720557562609665:1488801538:180782707: column=tweetdata:text, timestamp=1488801607097, value=RT @eraser: #Blockchain Technology Breakdown [img] by @FollowMyVote #fin 
              tech #BigData #IoT #insurtech #cryptocurrency\xE2\x80\xA6                  
838720557562609665:1488801538:180782707: column=tweetdata:urls, timestamp=1488801607097, value=                   
838720557562609665:1488801538:180782707: column=tweetdata:usermentions, timestamp=1488801607097, value=eraser, FollowMyVote            
838720557562609665:1488801538:180782707: column=user:followers_count, timestamp=1488801607097, value=964                 
838720557562609665:1488801538:180782707: column=user:following_count, timestamp=1488801607097, value=NA                 
838720557562609665:1488801538:180782707: column=user:friends_count, timestamp=1488801607097, value=700                 
838720557562609665:1488801538:180782707: column=user:id, timestamp=1488801607097, value=180782707                  
838720557562609665:1488801538:180782707: column=user:profile_image_url, timestamp=1488801607097, value=http://pbs.twimg.com/profile_images/2859966744/2f056cd86881e491f4 
              2c4bd942f5c5be_normal.png                          
838720557562609665:1488801538:180782707: column=user:screen_name, timestamp=1488801607097, value=bitiji                 
838720557562609665:1488801538:180782707: column=user:timezone, timestamp=1488801607097, value=Madrid 

我已經完成了 Lily 與 Solr 的設置,只剩下最後一步,即添加索引器(adding an indexer):

# Register an indexer named "myindexer" from the indexdemo-indexer.xml definition,
# passing the Solr ZooKeeper address and the target collection as connection
# parameters (-cp). The backslash must be the very last character on its line:
# a trailing space after it breaks the line continuation.
./bin/hbase-indexer add-indexer -n myindexer -c indexdemo-indexer.xml \
    -cp solr.zk=localhost:2181/solr -cp solr.collection=collection1

要執行上面的命令,我需要創建 indexdemo-indexer.xml 文件。示例如下:

<?xml version="1.0"?> 
<!-- Sample indexer definition quoted in the question. It targets the table
     "indexdemo-user"; each <field> pairs a field name with a column written
     as family:qualifier (here the "info" family). Only the age field carries
     an explicit type ("int"); the other two declare none. -->
<indexer table="indexdemo-user"> 
    <field name="firstname_s" value="info:firstname"/> 
    <field name="lastname_s" value="info:lastname"/> 
    <field name="age_i" value="info:age" type="int"/> 
</indexer> 

如何爲我的數據(上面提到的 HBase 示例行)創建上述文件?(注意:我猜可以使用 column=tweetdata:hashtags 或 column=tweetdata:text 來實現,但具體該怎麼做?)

回答

0

這是一種做法:

<?xml version="1.0"?> 
<!-- Indexer definition for the tweet rows shown in the question. Each <field>
     pairs a field name with an HBase column written as family:qualifier,
     covering the "tweetdata" and "user" column families of the sample row.
     The raw "json:tweetJSON" column is not mapped.
     NOTE(review): table="indextweet" must match the real HBase table name,
     which the question does not show; confirm before use.
     NOTE(review): no field declares a type attribute, and several columns in
     the sample row hold the placeholder value "NA"; confirm that the target
     Solr schema accepts these values for every field listed here. -->
<indexer table="indextweet"> 
    <!-- Columns from the "tweetdata" family. -->
    <field name="id" value="tweetdata:id"/> 
    <field name="created_at" value="tweetdata:created_at"/> 
    <field name="created_time_lucene" value="tweetdata:created_time_lucene"/> 
    <field name="text" value="tweetdata:text"/> 
    <field name="source" value="tweetdata:source"/> 
    <field name="in_reply_to_status_id" value="tweetdata:in_reply_to_status_id"/> 
    <field name="in_reply_to_user_id" value="tweetdata:in_reply_to_user_id"/> 
    <field name="in_reply_to_screen_name" value="tweetdata:in_reply_to_screen_name"/> 
    <field name="coordinates" value="tweetdata:coordinates"/> 
    <field name="place" value="tweetdata:place"/> 
    <field name="retweeted_status_id" value="tweetdata:retweeted_status_id"/> 
    <field name="retweeted_status_text" value="tweetdata:retweeted_status_text"/> 
    <field name="retweeted_status_user_id" value="tweetdata:retweeted_status_user_id"/> 
    <field name="retweeted_status_user_name" value="tweetdata:retweeted_status_user_name"/> 
    <field name="hashtags" value="tweetdata:hashtags"/> 
    <field name="urls" value="tweetdata:urls"/> 
    <field name="usermentions" value="tweetdata:usermentions"/> 
    <!-- Columns from the "user" family. user:id is exposed as "userid",
         presumably to keep it distinct from the tweet "id" field above. -->
    <field name="userid" value="user:id"/> 
    <field name="screen_name" value="user:screen_name"/> 
    <field name="timezone" value="user:timezone"/> 
    <field name="followers_count" value="user:followers_count"/> 
    <field name="friends_count" value="user:friends_count"/> 
    <field name="following_count" value="user:following_count"/> 
    <field name="profile_image_url" value="user:profile_image_url"/> 
</indexer>