Sunday, November 10, 2019
Example: batch-renaming image files with a bash shell script

#!/bin/bash
# Rename every file matching img-1* to a zero-padded, odd-numbered .tif file (01.tif, 03.tif, 05.tif, ...)
files="img-1*"
start=1
rename=1
for i in $files
do
  echo $i
  rename=$(($start * 2 - 1))
  #rename=$(($start * 2))   # use this line instead to get even numbers
  start=$(($start + 1))
  echo $rename
  refilename=`printf "%02u" $rename`
  mv $i $refilename.tif
done
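As a quick illustration (a hypothetical run; the script name rename.sh and the input file names are only examples, not from the original post), three scanned pages would be renamed like this:

$ ls
img-1001.tif  img-1002.tif  img-1003.tif
$ bash rename.sh
img-1001.tif
1
img-1002.tif
3
img-1003.tif
5
$ ls
01.tif  03.tif  05.tif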
Using the IK Analysis Plugin with Elasticsearch on Ubuntu
Download the latest IK Analysis Plugin (version 7.4.2 in this example) and also download an extended Chinese dictionary file:
sudo mkdir /usr/share/elasticsearch/plugins/ik/
# cd is a shell builtin, so it does not need (and does not work under) sudo
cd /usr/share/elasticsearch/plugins/ik/
sudo wget https://github.com/medcl/elasticsearch-analysis-ik/releases/download/v7.4.2/elasticsearch-analysis-ik-7.4.2.zip
sudo unzip elasticsearch-analysis-ik-7.4.2.zip
sudo \rm elasticsearch-analysis-ik-7.4.2.zip
cd /usr/share/elasticsearch/plugins/ik/config
sudo wget https://github.com/samejack/sc-dictionary/raw/master/main.txt
sudo mv main.dic main.dic.old
sudo mv main.txt main.dic
Configure the extended dictionary by editing the IKAnalyzer.cfg.xml configuration file:
cd /usr/share/elasticsearch/plugins/ik/config
sudo vi IKAnalyzer.cfg.xml
Add main.dic to the ext_dict entry:
<entry key="ext_dict">main.dic</entry>
Restart Elasticsearch:
sudo service elasticsearch restart
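To confirm the plugin was actually picked up after the restart, you can list the installed plugins (a quick sanity check; this assumes Elasticsearch is listening on localhost:9200 as in the examples below):

curl -XGET http://localhost:9200/_cat/plugins
# the output should include a line mentioning analysis-ik and its version (7.4.2 here)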
If everything went well, let's give IK a quick test...
curl -XGET http://localhost:9200/_analyze -H 'Content-Type:application/json' -d'
{
    "text": "後悔莫及的人家",
    "analyzer": "ik_smart"
}'
You should get a result like this:
{ "tokens" : [ { "token" : "後悔莫及", "start_offset" : 0, "end_offset" : 4, "type" : "CN_WORD", "position" : 0 }, { "token" : "的", "start_offset" : 4, "end_offset" : 5, "type" : "CN_CHAR", "position" : 1 }, { "token" : "人家", "start_offset" : 5, "end_offset" : 7, "type" : "CN_WORD", "position" : 2 } ] }
Next, create a new index, let's call it test, and run a quick IK word-segmentation test on it...
curl -XPUT http://localhost:9200/test

curl -XPOST http://localhost:9200/test/_mapping -H 'Content-Type:application/json' -d'
{
    "properties": {
        "content": {
            "type": "text",
            "analyzer": "ik_max_word",
            "search_analyzer": "ik_smart"
        }
    }
}'

curl -XPOST http://localhost:9200/test/_create/1 -H 'Content-Type:application/json' -d'
{"content":"曾經有一份真摯的感情放在我面前.我沒有珍惜.等到失去的時候才後悔莫及,塵世間最痛苦的事莫過於此.你的劍在我的咽喉上割下去吧!不要再猶豫了!如果上天能夠給我一個再來一次的機會,我一定會對那個女孩子說三個字\"我愛你\",如果非要在這份愛上加一個期限的話,我希望是一萬年。"}'

curl -XPOST http://localhost:9200/test/_search -H 'Content-Type:application/json' -d'
{
    "query": {"match": {"content": "如果"}},
    "highlight": {
        "pre_tags": ["<tag1>"],
        "post_tags": ["</tag1>"],
        "fields": {
            "content": {}
        }
    }
}'
The result should look like this:
{ "took" : 292, "timed_out" : false, "_shards" : { "total" : 1, "successful" : 1, "skipped" : 0, "failed" : 0 }, "hits" : { "total" : { "value" : 1, "relation" : "eq" }, "max_score" : 0.40037507, "hits" : [ { "_index" : "test", "_type" : "_doc", "_id" : "1", "_score" : 0.40037507, "_source" : { "content" : """曾經有一份真摯的感情放在我面前.我沒有珍惜.等到失去的時候才後悔莫及,塵世間最痛苦的事莫過於此.你的劍在我的咽喉上割下去吧!不要再猶豫了!如果上天能夠給我一個再來一次的機會,我一定會對那個女孩子說三個字"我愛你",如果非要在這份愛上加一個期限的話,我希望是一萬年。""" }, "highlight" : { "content" : [ """<tag1>如果</tag1>上天能夠給我一個再來一次的機會,我一定會對那個女孩子說三個字"我愛你",<tag1>如果</tag1>非要在這份愛上加一個期限的話,我希望是一萬年。""" ] } } ] } }
Labels: 2019, Chinese, configuration, elasticsearch
Tuesday, October 22, 2019
Installing Nginx + MySQL + PHP on Ubuntu 18.04
Install Nginx and MySQL:
sudo apt-get update
sudo apt-get install nginx
sudo apt-get install mysql-server
Run the MySQL security configuration:
sudo mysql_secure_installation
Install PHP:
sudo apt-get install php-fpm php-mysql php-json php-gd
Configure php.ini: edit /etc/php/7.2/fpm/php.ini and set cgi.fix_pathinfo to 0:
;cgi.fix_pathinfo=1
cgi.fix_pathinfo=0
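If you prefer not to open an editor, a one-line sed edit should achieve the same change (a sketch, assuming the line still reads ;cgi.fix_pathinfo=1 as shipped; -i.bak keeps a backup copy):

sudo sed -i.bak 's/^;cgi.fix_pathinfo=1/cgi.fix_pathinfo=0/' /etc/php/7.2/fpm/php.ini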
Restart the PHP service:
sudo service php7.2-fpm restart
Set up a new nginx site, using adia.cc as the example. Edit the file /etc/nginx/sites-available/adia.cc:
server {
    listen 8080;

    root /var/www/adia.cc/html;
    index index.php index.html index.htm;

    access_log /var/log/nginx/adia.cc.access.log;
    error_log /var/log/nginx/adia.cc.error.log;

    server_name adia.cc www.adia.cc;

    location / {
        try_files $uri $uri/ =404;
    }

    location ~ \.php$ {
        include snippets/fastcgi-php.conf;
        fastcgi_pass unix:/var/run/php/php7.2-fpm.sock;
    }

    location ~ /\.ht {
        deny all;
    }
}
Create a symlink to enable the adia.cc site configuration, then test whether the configuration is valid:
sudo ln -s /etc/nginx/sites-available/adia.cc /etc/nginx/sites-enabled/
sudo nginx -t
If it is correct, you should see something like:
nginx: the configuration file /etc/nginx/nginx.conf syntax is ok
nginx: configuration file /etc/nginx/nginx.conf test is successful
Next, create a PHP test page by editing /var/www/adia.cc/html/info.php:
<?php phpinfo(); ?>
Restart the Nginx service:
sudo service nginx restart
Finally, use a browser to check whether http://YOUR_IP/ and http://YOUR_IP:8080/info.php respond correctly.
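You can also check from the command line instead of a browser (a minimal check; YOUR_IP is a placeholder as above):

curl -I http://YOUR_IP:8080/info.php
# an HTTP 200 OK response indicates PHP pages are being served through nginx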
phpinfo() output (screenshot)
Installing an SSH server on Ubuntu 18.04 and switching to text-mode boot (disable GUI boot mode)
First confirm that the Ubuntu release is 18.04:
lsb_release -a
Execution result:
Distributor ID: Ubuntu
Description:    Ubuntu 18.04.3 LTS
Release:        18.04
Codename:       bionic
Install the SSH server:
sudo apt-get install openssh-server
sudo update-rc.d ssh defaults
sudo service ssh start
Disable GUI boot mode; the change takes effect after a reboot:
sudo systemctl set-default multi-user.target
sudo sync;sync;sync;reboot
If you later want to switch back to GUI boot mode:
sudo systemctl set-default graphical.target
sudo sync;sync;sync;reboot
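To check which mode is currently configured as the default boot target, systemd can report it directly:

systemctl get-default
# prints multi-user.target (text mode) or graphical.target (GUI mode)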
Set the NTP time-sync host to clock.stdtime.gov.tw by editing /etc/systemd/timesyncd.conf:
sudo vi /etc/systemd/timesyncd.conf
Delete the comment marker (#) in front of the NTP= entry, set the sync host, and save:
#NTP=
NTP=clock.stdtime.gov.tw
Restart the time-sync service, then check its status:
sudo service systemd-timesyncd restart
sudo service systemd-timesyncd status
The output shows that the time server has changed from the original ntp.ubuntu.com to clock.stdtime.gov.tw:
● systemd-timesyncd.service - Network Time Synchronization
(...omitted...)
   Status: "Synchronized to time server 211.22.103.158:123 (clock.stdtime.gov.tw)."
(...omitted...)
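timedatectl gives another quick summary of the time-sync state (no root needed); once timesyncd has reached the new server it should report that the system clock is synchronized:

timedatectl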
Monday, October 14, 2019
11th iT邦幫忙鐵人賽 (Ironman Contest): examples adapted from the "Python 程式交易 30 天新手入門" (30-day beginner's introduction to program trading with Python) series
You may need to install the following Python packages first:
pip install js2py
pip install loguru
pip install pandas
pip install plotly
pip install pyquery
The script below is adapted from the examples in the series above; save it as getTaxies.py:
import csv
import datetime
import fractions
import json
import os
import random
import re
import time
import urllib.parse
import argparse
from random import randint
from time import sleep

import js2py
import loguru
import pandas
import plotly.graph_objects
import pyquery
import requests
import requests.exceptions

now = datetime.datetime.now()
proxies = []
proxy = None


class Taiex:
    def __init__(self, date, openPrice, highestPrice, lowestPrice, closePrice):
        # date
        self.Date = date
        # opening price
        self.OpenPrice = openPrice
        # highest price
        self.HighestPrice = highestPrice
        # lowest price
        self.LowestPrice = lowestPrice
        # closing price
        self.ClosePrice = closePrice

    # object representation
    def __repr__(self):
        return f'class Taiex {{ Date={self.Date}, OpenPrice={self.OpenPrice}, HighestPrice={self.HighestPrice}, LowestPrice={self.LowestPrice}, ClosePrice={self.ClosePrice} }}'


def getProxy():
    global proxies
    if len(proxies) == 0:
        getProxies()
    proxy = random.choice(proxies)
    loguru.logger.debug(f'getProxy: {proxy}')
    proxies.remove(proxy)
    loguru.logger.debug(f'getProxy: {len(proxies)} proxies is unused.')
    return proxy


def reqProxies(hour):
    global proxies
    proxies = proxies + getProxiesFromProxyNova()
    proxies = proxies + getProxiesFromGatherProxy()
    proxies = proxies + getProxiesFromFreeProxyList()
    proxies = list(dict.fromkeys(proxies))
    loguru.logger.debug(f'reqProxies: {len(proxies)} proxies is found.')


def getProxies():
    global proxies
    hour = f'{now:%Y%m%d%H}'
    filename = f'proxies-{hour}.csv'
    filepath = f'{filename}'
    if os.path.isfile(filepath):
        loguru.logger.info(f'getProxies: {filename} exists.')
        loguru.logger.warning(f'getProxies: {filename} is loading...')
        with open(filepath, 'r', newline='', encoding='utf-8-sig') as f:
            reader = csv.DictReader(f)
            for row in reader:
                proxy = row['Proxy']
                proxies.append(proxy)
            loguru.logger.success(f'getProxies: {filename} is loaded.')
    else:
        loguru.logger.info(f'getProxies: {filename} does not exist.')
        reqProxies(hour)
        loguru.logger.warning(f'getProxies: {filename} is saving...')
        with open(filepath, 'w', newline='', encoding='utf-8-sig') as f:
            writer = csv.writer(f)
            writer.writerow(['Proxy'])
            for proxy in proxies:
                writer.writerow([proxy])
            loguru.logger.success(f'getProxies: {filename} is saved.')


def getProxiesFromProxyNova():
    proxies = []
    countries = ['tw', 'jp', 'kr', 'id', 'my', 'th', 'vn', 'ph', 'hk', 'uk', 'us']
    for country in countries:
        url = f'https://www.proxynova.com/proxy-server-list/country-{country}/'
        loguru.logger.debug(f'getProxiesFromProxyNova: {url}')
        loguru.logger.warning(f'getProxiesFromProxyNova: downloading...')
        response = requests.get(url)
        if response.status_code != 200:
            loguru.logger.debug(f'getProxiesFromProxyNova: status code is not 200')
            continue
        loguru.logger.success(f'getProxiesFromProxyNova: downloaded.')
        d = pyquery.PyQuery(response.text)
        table = d('table#tbl_proxy_list')
        rows = list(table('tbody:first > tr').items())
        loguru.logger.warning(f'getProxiesFromProxyNova: scanning...')
        for row in rows:
            tds = list(row('td').items())
            if len(tds) == 1:
                continue
            js = row('td:nth-child(1) > abbr').text()
            js = 'let x = %s; x' % (js[15:-2])
            ip = js2py.eval_js(js).strip()
            port = row('td:nth-child(2)').text().strip()
            proxy = f'{ip}:{port}'
            proxies.append(proxy)
        loguru.logger.success(f'getProxiesFromProxyNova: scanned.')
        loguru.logger.debug(f'getProxiesFromProxyNova: {len(proxies)} proxies is found.')
        time.sleep(1)
    return proxies


def getProxiesFromGatherProxy():
    proxies = []
    countries = ['Taiwan', 'Japan', 'United States', 'Thailand', 'Vietnam', 'Indonesia', 'Singapore', 'Philippines', 'Malaysia', 'Hong Kong']
    for country in countries:
        url = f'http://www.gatherproxy.com/proxylist/country/?c={urllib.parse.quote(country)}'
        loguru.logger.debug(f'getProxiesFromGatherProxy: {url}')
        loguru.logger.warning(f'getProxiesFromGatherProxy: downloading...')
        response = requests.get(url)
        if response.status_code != 200:
            loguru.logger.debug(f'getProxiesFromGatherProxy: status code is not 200')
            continue
        loguru.logger.success(f'getProxiesFromGatherProxy: downloaded.')
        d = pyquery.PyQuery(response.text)
        scripts = list(d('table#tblproxy > script').items())
        loguru.logger.warning(f'getProxiesFromGatherProxy: scanning...')
        for script in scripts:
            script = script.text().strip()
            script = re.sub(r'^gp\.insertPrx\(', '', script)
            script = re.sub(r'\);$', '', script)
            script = json.loads(script)
            ip = script['PROXY_IP'].strip()
            port = int(script['PROXY_PORT'].strip(), 16)
            proxy = f'{ip}:{port}'
            proxies.append(proxy)
        loguru.logger.success(f'getProxiesFromGatherProxy: scanned.')
        loguru.logger.debug(f'getProxiesFromGatherProxy: {len(proxies)} proxies is found.')
        time.sleep(1)
    return proxies


def getProxiesFromFreeProxyList():
    proxies = []
    url = 'https://free-proxy-list.net/'
    loguru.logger.debug(f'getProxiesFromFreeProxyList: {url}')
    loguru.logger.warning(f'getProxiesFromFreeProxyList: downloading...')
    response = requests.get(url)
    if response.status_code != 200:
        loguru.logger.debug(f'getProxiesFromFreeProxyList: status code is not 200')
        return
    loguru.logger.success(f'getProxiesFromFreeProxyList: downloaded.')
    d = pyquery.PyQuery(response.text)
    trs = list(d('table#proxylisttable > tbody > tr').items())
    loguru.logger.warning(f'getProxiesFromFreeProxyList: scanning...')
    for tr in trs:
        tds = list(tr('td').items())
        ip = tds[0].text().strip()
        port = tds[1].text().strip()
        proxy = f'{ip}:{port}'
        proxies.append(proxy)
    loguru.logger.success(f'getProxiesFromFreeProxyList: scanned.')
    loguru.logger.debug(f'getProxiesFromFreeProxyList: {len(proxies)} proxies is found.')
    return proxies


# Fetch the after-market data for every trading day in the given year and month
def getTaiexs(year, month):
    global proxy
    taiexs = []
    while True:
        if proxy is None:
            proxy = getProxy()
        url = f'https://www.twse.com.tw/indicesReport/MI_5MINS_HIST?response=json&date={year}{month:02}01'
        loguru.logger.info(f'getTaiexs: month {month} url is {url}')
        loguru.logger.warning(f'getTaiexs: month {month} is downloading...')
        try:
            response = requests.get(
                url,
                proxies={'https': f'https://{proxy}'},
                timeout=3
            )
            if response.status_code != 200:
                loguru.logger.success(f'getTaiexs: month {month} status code is not 200.')
                proxy = None
                break
            loguru.logger.success(f'getTaiexs: month {month} is downloaded.')
            body = response.json()
            stat = body['stat']
            if stat != 'OK':
                loguru.logger.error(f'getTaiexs: month {month} responses with error({stat}).')
                break
            records = body['data']
            if len(records) == 0:
                loguru.logger.success(f'getTaiexs: month {month} has no data.')
                break
            for record in records:
                date = record[0].strip()
                parts = date.split('/')
                y = int(parts[0]) + 1911
                m = int(parts[1])
                d = int(parts[2])
                date = f'{y}{m:02d}{d:02d}'
                openPrice = record[1].replace(',', '').strip()
                highestPrice = record[2].replace(',', '').strip()
                lowestPrice = record[3].replace(',', '').strip()
                closePrice = record[4].replace(',', '').strip()
                taiex = Taiex(
                    date=date,
                    openPrice=openPrice,
                    highestPrice=highestPrice,
                    lowestPrice=lowestPrice,
                    closePrice=closePrice
                )
                taiexs.append(taiex)
        except requests.exceptions.ConnectionError:
            loguru.logger.error(f'getTaiexs: proxy({proxy}) is not working (connection error).')
            proxy = None
            continue
        except requests.exceptions.ConnectTimeout:
            loguru.logger.error(f'getTaiexs: proxy({proxy}) is not working (connect timeout).')
            proxy = None
            continue
        except requests.exceptions.ProxyError:
            loguru.logger.error(f'getTaiexs: proxy({proxy}) is not working (proxy error).')
            proxy = None
            continue
        except requests.exceptions.SSLError:
            loguru.logger.error(f'getTaiexs: proxy({proxy}) is not working (ssl error).')
            proxy = None
            continue
        except Exception as e:
            loguru.logger.error(f'getTaiexs: proxy({proxy}) is not working.')
            loguru.logger.error(e)
            proxy = None
            continue
        break
    return taiexs


# Save the after-market data passed in
def saveTaiexs(filepath, taiexs):
    loguru.logger.info(f'saveTaiexs: {len(taiexs)} taiexs.')
    loguru.logger.warning(f'saveTaiexs: {filepath} is saving...')
    with open(filepath, mode='w', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        writer.writerow(['Date', 'OpenPrice', 'HighestPrice', 'LowestPrice', 'ClosePrice'])
        for taiex in taiexs:
            writer.writerow([taiex.Date, taiex.OpenPrice, taiex.HighestPrice, taiex.LowestPrice, taiex.ClosePrice])
    loguru.logger.success(f'main: {filepath} is saved.')


def main(args):
    taiexs = []
    thisYear = 2019
    if args.year:
        loguru.logger.info(f'[-y|--year] [value:{args.year}]')
        thisYear = int(args.year)
    else:
        loguru.logger.info(f'[-y|--year] is not used, set year in ' + str(thisYear))
    # Fetch the after-market data for January through December of the selected year
    for month in range(1, 13):
        # Pause for 3 to 15 seconds between requests
        sleep(randint(3, 15))
        taiexs = taiexs + getTaiexs(thisYear, month)
    filepath = f'taiexs-' + str(thisYear) + '.csv'
    saveTaiexs(filepath, taiexs)
    # Read the downloaded records with pandas
    df = pandas.read_csv(filepath)
    # Convert the Date column to datetime using the given format
    df['Date'] = pandas.to_datetime(df['Date'], format='%Y%m%d')
    # Build the chart
    figure = plotly.graph_objects.Figure(
        data=[
            # Line chart: closing price
            plotly.graph_objects.Scatter(
                x=df['Date'],
                y=df['ClosePrice'],
                name='收盤價',
                mode='lines',
                line=plotly.graph_objects.scatter.Line(color='#6B99E5')
            ),
            # Candlestick chart: K bars
            plotly.graph_objects.Candlestick(
                x=df['Date'],
                open=df['OpenPrice'],
                high=df['HighestPrice'],
                low=df['LowestPrice'],
                close=df['ClosePrice'],
                name='盤後資訊',
            )
        ],
        # Set the X/Y axis display formats
        layout=plotly.graph_objects.Layout(
            xaxis=plotly.graph_objects.layout.XAxis(tickformat='%Y-%m'),
            yaxis=plotly.graph_objects.layout.YAxis(tickformat='.2f')
        )
    )
    figure.show()


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # -y [value:yyyy]
    # --year [value:yyyy]
    parser.add_argument('-y', '--year', help='set year in yyyy format', type=int)
    args = parser.parse_args()
    loguru.logger.add(
        f'{datetime.date.today():%Y%m%d}.log',
        rotation='1 day',
        retention='7 days',
        level='DEBUG'
    )
    main(args)

Using 2007 as the example year, run the following:
python getTaxies.py -y 2007
Execution result:
python getTaxies.py -y 2007
2019-10-14 03:34:40.292 | INFO | __main__:main:300 - [-y|--year] [value:2007]
2019-10-14 03:34:44.294 | INFO | __main__:getProxies:69 - getProxies: proxies-2019101403.csv exists.
2019-10-14 03:34:44.298 | WARNING | __main__:getProxies:70 - getProxies: proxies-2019101403.csv is loading...
2019-10-14 03:34:44.308 | SUCCESS | __main__:getProxies:76 - getProxies: proxies-2019101403.csv is loaded.
2019-10-14 03:34:44.313 | DEBUG | __main__:getProxy:50 - getProxy: 180.250.216.242:3128
2019-10-14 03:34:44.317 | DEBUG | __main__:getProxy:52 - getProxy: 724 proxies is unused.
2019-10-14 03:34:44.322 | INFO | __main__:getTaiexs:206 - getTaiexs: month 1 url is https://www.twse.com.tw/indicesReport/MI_5MINS_HIST?response=json&date=20070101
...(omitted)...
2019-10-14 03:36:58.484 | INFO | __main__:getTaiexs:206 - getTaiexs: month 12 url is https://www.twse.com.tw/indicesReport/MI_5MINS_HIST?response=json&date=20071201
2019-10-14 03:36:58.489 | WARNING | __main__:getTaiexs:207 - getTaiexs: month 12 is downloading...
2019-10-14 03:36:58.846 | SUCCESS | __main__:getTaiexs:220 - getTaiexs: month 12 is downloaded.
2019-10-14 03:36:58.850 | INFO | __main__:saveTaiexs:275 - saveTaiexs: 247 taiexs.
2019-10-14 03:36:58.853 | WARNING | __main__:saveTaiexs:276 - saveTaiexs: taiexs-2007.csv is saving...
2019-10-14 03:36:58.864 | SUCCESS | __main__:saveTaiexs:294 - main: taiexs-2007.csv is saved.
Chart: 2007 TAIEX (market index)
Saturday, July 27, 2019
Python 3 + Google News crawler + MONPA (罔拍) traditional Chinese word segmentation
import requests
from bs4 import BeautifulSoup
import monpa

res = requests.get("https://news.google.com.tw")
# specify the parser explicitly to avoid a bs4 warning
soup = BeautifulSoup(res.text, "html.parser")

count = 1
for item in soup.select(".MQsxIb"):
    print('=====[', count, ']=====')
    news_title = item.select(".ipQwMb")[0].text
    news_url = item.select(".ipQwMb")[0].find('a')['href']
    print(news_title)
    result = monpa.pseg(news_title)
    for t in result:
        print(t)
    count += 1
    #print(news_url)

Execution result:
=====[ 1 ]=====
國安私菸案》檢調大搜索華航總部 歷時9小時公關室也沒放過
['國安', 'ORG'] ['私菸案', 'Na'] ['》', 'PARENTHESISCATEGORY'] ['檢調', 'Na'] ['大', 'VH'] ['搜索', 'VC'] ['華航', 'ORG'] ['總部', 'Nc'] [' ', 'D'] ['歷時', 'VJ'] ['9', 'Neu'] ['小時', 'Nf'] ['公關室', 'Nc'] ['也', 'D'] ['沒', 'D'] ['放過', 'VC']
=====[ 2 ]=====
快訊/國安局私菸事件華航首波懲處拔官名單出爐| 交通氣象
['快訊', 'Na'] ['/', 'PERIODCATEGORY'] ['國安局', 'ORG'] ['私菸', 'VA'] ['事件', 'Na'] ['華航', 'ORG'] ['首', 'Nes'] ['波', 'Nf'] ['懲處', 'Nv'] ['拔官', 'Na'] ['名單', 'Na'] ['出爐', 'VH'] ['|', 'FW'] [' ', 'VC'] ['交通', 'Na'] ['氣象', 'Na']
=====[ 3 ]=====
國安私菸案》吳宗憲傳致電黃川禎:我訂那麼多 華航高層知道嗎
['國安', 'Nb'] ['私菸案', 'Na'] ['》', 'PARENTHESISCATEGORY'] ['吳宗憲', 'PER'] ['傳', 'VD'] ['致電', 'VC'] ['黃川禎', 'PER'] [':', 'COLONCATEGORY'] ['我', 'Nh'] ['訂', 'VC'] ['那麼多', 'Neqa'] [' 華航', 'PER'] ['高層', 'Na'] ['知道', 'VK'] ['嗎', 'T']
=====[ 4 ]=====
華航公布14次專機負責人!開鍘羅雅美、邱彰信
['華航', 'PER'] ['公布', 'VE'] ['14', 'Neu'] ['次', 'Nf'] ['專機', 'Na'] ['負責人', 'Na'] ['!', 'EXCLAMATIONCATEGORY'] ['開', 'VC'] ['鍘羅雅美', 'PER'] ['、', 'PAUSECATEGORY'] ['邱彰信', 'PER']
=====[ 5 ]=====
華航開鍘!公布14次專機負責人羅雅美邱彰信調非主管職| 政治
['華航', 'PER'] ['開鍘', 'Na'] ['!', 'EXCLAMATIONCATEGORY'] ['公布', 'VE'] ['14', 'Neu'] ['次', 'Nf'] ['專機', 'Na'] ['負責人', 'Na'] ['羅雅美', 'PER'] ['邱彰信調', 'PER'] ['非', 'VG'] ['主管', 'Na'] ['職', 'Na'] ['|', 'FW'] [' 政治', 'PER']
=====[ 6 ]=====
只是賣太多菸?他揭華航最大漏洞
['只是', 'D'] ['賣', 'VD'] ['太多', 'Neqa'] ['菸', 'Na'] ['?', 'QUESTIONCATEGORY'] ['他', 'Nh'] ['揭', 'VC'] ['華航', 'ORG'] ['最', 'Dfa'] ['大', 'VH'] ['漏洞', 'Na']
=====[ 7 ]=====
黃國昌揭私菸案 洪慈庸:綠支持者問「這樣好嗎?」
['黃國昌', 'PER'] ['揭', 'VC'] ['私菸案', 'Na'] [' 洪慈庸', 'PER'] [':', 'COLONCATEGORY'] ['綠', 'VH'] ['支持', 'VC'] ['者', 'Na'] ['問', 'VE'] ['「', 'PARENTHESISCATEGORY'] ['這樣', 'VH'] ['好', 'VH'] ['嗎', 'T'] ['?', 'QUESTIONCATEGORY'] ['」', 'PARENTHESISCATEGORY']
=====[ 8 ]=====
時力分裂?黨內同志要約黃國昌見面 結果是......
['時力', 'Na'] ['分裂', 'VHC'] ['?', 'QUESTIONCATEGORY'] ['黨內', 'Nc'] ['同志', 'Na'] ['要', 'D'] ['約', 'VC'] ['黃國昌', 'PER'] ['見面 ', 'Na'] ['結果', 'Na'] ['是', 'SHI'] ['......', 'ETCCATEGORY']
=====[ 9 ]=====
內鬨?洪慈庸:綠基層焦慮黃國昌:追弊案不可看顏色
['內鬨', 'Na'] ['?', 'QUESTIONCATEGORY'] ['洪慈庸', 'PER'] [':', 'COLONCATEGORY'] ['綠', 'VH'] ['基層', 'A'] ['焦慮', 'Nv'] ['黃國昌', 'PER'] [':', 'COLONCATEGORY'] ['追', 'VC'] ['弊案', 'Na'] ['不可', 'D'] ['看', 'VC'] ['顏色', 'Na']
Tuesday, April 30, 2019
Fetching the lists of TWSE-listed and OTC-listed companies with Python and writing them to a MySQL database
Without further ado, here is the code:
# How to get the lists of TWSE-listed and OTC-listed stocks
import requests
import time
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.types import NVARCHAR, Date

def getTWSE(str_mode):
    # Set the crawler's User-Agent
    headers = {'user-agent': 'Mozilla/5.0 (Macintosh Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36'}
    # TWSE-listed: http://isin.twse.com.tw/isin/C_public.jsp?strMode=2
    # OTC-listed:  http://isin.twse.com.tw/isin/C_public.jsp?strMode=4
    req_url = "http://isin.twse.com.tw/isin/C_public.jsp?strMode=%s" % (str_mode)
    res = requests.get(req_url, headers=headers)
    df = pd.read_html(res.text)[0]
    # Use the first row as the column names
    df.columns = df.iloc[0]
    # Drop the first row
    df = df.iloc[1:]
    # Keep only rows, then columns, that have at least three non-NaN values
    df = df.dropna(thresh=3, axis=0).dropna(thresh=3, axis=1)
    df[['有價證券代號','名稱']] = df['有價證券代號及名稱'].str.split(n=1, expand=True)
    del df['有價證券代號及名稱']
    df = df.set_index('有價證券代號')
    return df

engine = create_engine("mysql+pymysql://stockuser:password@127.0.0.1:3306/stockdb")
dtypedict = {
    '有價證券代號': NVARCHAR(length=32),
    '上市日': Date()
}

# Fetch the TWSE-listed companies and write them to MySQL > stockdb > listed_code
# (the listed_code table is replaced if it already exists)
listed_companies = "2"  # TWSE-listed companies
mydf1 = getTWSE(listed_companies)
mydf1.to_sql(name="listed_code", con=engine, if_exists='replace', index=True, dtype=dtypedict)

# Sleep for 10 seconds first
time.sleep(10)

# Fetch the OTC-listed companies and append them to MySQL > stockdb > listed_code
listed_companies = "4"  # OTC-listed companies
mydf2 = getTWSE(listed_companies)
mydf2.to_sql(name="listed_code", con=engine, if_exists='append', index=True, dtype=dtypedict)
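To double-check the import, a quick row count can be run from the mysql client (a sketch, assuming the stockuser credentials and stockdb database used in the script above):

mysql -u stockuser -p -e "SELECT COUNT(*) FROM stockdb.listed_code;"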
Monday, April 29, 2019
Reading MySQL data with Python
To connect SQLAlchemy to MySQL, you must first install the pymysql module:
pip install pymysql
Next, let's use Python to read the stock-price database we previously populated with R:
%matplotlib inline
import pandas as pd
from sqlalchemy import create_engine

engine = create_engine("mysql+pymysql://stockuser:password@127.0.0.1:3306/stockdb")
sql = "select * from tw1101;"
df = pd.read_sql_query(sql, engine)
print(df)
df.close.plot()

Execution result:
      row_names        date       open       high        low      close  \
0             1  2009-01-02  24.193001  24.193001  24.193001  24.193001
1             2  2009-01-05  25.135599  25.225300  24.597000  24.821400
2             3  2009-01-06  24.955999  25.135599  24.372499  24.641800
3             4  2009-01-07  25.090700  25.449800  24.417400  25.225300
4             5  2009-01-08  24.372499  24.731600  23.789000  24.417400
5             6  2009-01-09  24.552099  25.090700  23.789000  23.789000
...
          volume   adjusted
0            0.0  14.254436
1     22712510.0  14.624688
2     20298561.0  14.518868
3     26370751.0  14.862664
4     23748492.0  14.386651
...
2536  30751985.0  41.000000
2537  23165336.0  41.599998
2538  16390191.0  41.549999
2539  13459252.0  41.799999
2540  11300709.0  41.750000

[2541 rows x 8 columns]
Sunday, April 28, 2019
Fetching stock price data with R tidyquant and writing it to a MySQL database
First create a new MySQL user, stockuser:
mysql -u root -p
When the mysql> prompt appears, create the new user and set a password:
mysql> GRANT ALL PRIVILEGES ON *.* TO 'stockuser'@'%' IDENTIFIED BY 'password';
mysql> FLUSH PRIVILEGES;
mysql> quit;
If the MySQL server needs to accept remote connections, remember to allow port 3306 through the firewall, and also check the MySQL configuration file:
vi /etc/mysql/my.cnf
Remember to comment out the bind-address = 127.0.0.1 line:
#bind-address = 127.0.0.1
Also remember to restart MySQL:
service mysql restart
Once the database side is working, you are ready to fetch stock prices with R. If the RMySQL package has not been installed yet, install it first:
install.packages("RMySQL")
Once the installation is confirmed, let's get started...
library(RMySQL)
library(DBI)
library(tidyquant)
library(dplyr)

cn <- dbConnect(
  drv = RMySQL::MySQL(),
  username = 'stockuser',
  password = 'password',
  host = '127.0.0.1',
  port = 3306,
  dbname = "stockdb"
)

# Use tidyquant to fetch Taiwan 50 stock prices from Yahoo and write them to MySQL stockdb > tw{stock code}
tw50 <- c("1101","1102","1216")   # only three stocks as a demo
for (i in tw50) {
  stockid <- paste(i, "TW", sep=".")
  tblname <- paste("tw", i, sep="")
  print(stockid)
  sdata <- tq_get(stockid)
  dbWriteTable(cn, name=tblname, value=sdata, overwrite=TRUE)
  Sys.sleep(10)   # rest 10 seconds between loop iterations
}

# Use tidyquant to fetch the IBM stock prices from Yahoo and write them to MySQL stockdb > ibm
ibm <- tq_get("IBM")
dbWriteTable(cn, name="ibm", value=ibm, overwrite=TRUE)

dbDisconnect(cn)   # remember to close the database connection

Once all the data has been fetched, the stockdb database in MySQL will contain one table per stock:
mysql -u stockuser -p
After entering the password and logging in...
mysql> use stockdb;
Database changed
mysql> show tables;
+-------------------+
| Tables_in_stockdb |
+-------------------+
| ibm               |
| tw1101            |
| tw1102            |
| tw1216            |
+-------------------+
4 rows in set (0.00 sec)
From now on you can read the stock-price data directly from MySQL:
# Read the IBM stock prices back out of MySQL
cn <- src_mysql(
  user = 'stockuser',
  password = 'password',
  host = '127.0.0.1',
  port = 3306,
  dbname = "stockdb"
)
ibm_table <- tbl(cn, "ibm")
ibm_table
dbDisconnect(cn)

Further reading:
Amazon RDS + R (actually I can't afford to rent it, so I run my own MySQL...)
R study notes - Data engineering, part 4: Database