Data Collection and Processing for Intelligent Technology Ecosystem Analysis
- sklearn(TfidfVectorizer, CountVectorizer, PCA)
- KMeans
- DBSCAN
- Research Procedure
- Item-based Technology Type Analysis
- Topic-based Technology Type Analysis
- Research Results (GitHub Key Repository Analysis)
- Star-based
- Big Tech Companies
- Future Tech: Autonomous Vehicles, Metaverse
- GitHub open-source information and API analysis
- GitHub provides an API as a tool for examining large volumes of open-source information efficiently, and it supports searching various technology attributes such as developers, development environments, and project status.
- The search results for the keyword "deep learning" can be viewed as an API (JSON) response in the browser via the URL below: https://api.github.com/search/repositories?q=deep%20learning&page,per_page,sort,order
- Crawling test for the required repository information using the API
# Shared imports for the snippets below
from urllib.request import urlopen
import json
import time
import pandas as pd

def topic(t):
    # Fetch repository metadata for a search keyword via the GitHub search API.
    query = t.replace(' ', '%20')
    response = urlopen('https://api.github.com/search/repositories?q={}&page,per_page,sort,order'.format(query)).read().decode('utf-8')
    responseJson = json.loads(response)
    name_lst, type_lst, create_lst, size_lst, star_lst, fork_lst, login_lst = [], [], [], [], [], [], []
    items = responseJson.get('items')
    for lst in items:
        name_lst.append(lst.get('name'))
        type_lst.append(lst.get('owner').get('type'))
        create_lst.append(lst.get('created_at'))
        size_lst.append(lst.get('size'))
        star_lst.append(lst.get('stargazers_count'))
        fork_lst.append(lst.get('forks_count'))
        login_lst.append(lst.get('owner').get('login'))
    df = pd.DataFrame([name_lst, type_lst, create_lst, size_lst, star_lst, fork_lst, login_lst])
    df = df.transpose()
    df.columns = ['name', 'type', 'created_at', 'size', 'stargazers_count', 'fork', 'login']
    return df

# test
topic('deep learning')
- Response for every page in JSON form, crawling, and saving in Excel format
- GitHub itself imposes safeguards against heavy crawling, such as interface-based page limits and request-rate checks: time.sleep() adds a waiting period so that large-volume crawling can be repeated.
def topic(t):
    # Crawl up to 10 pages (100 repositories each) of search results, sorted by stars.
    query = t.replace(' ', '%20')
    name_lst, type_lst, create_lst, size_lst, star_lst, fork_lst, login_lst = [], [], [], [], [], [], []
    for i in range(1, 11):
        # Retry after a pause if the request fails (e.g. because of rate limiting).
        while True:
            try:
                response = urlopen('https://api.github.com/search/repositories?q={}&sort=stars&per_page=100&page={}'.format(query, i)).read().decode('utf-8')
                break
            except:
                time.sleep(10)
        responseJson = json.loads(response)
        print(f'{i} response')
        items = responseJson.get('items')
        for lst in items:
            name_lst.append(lst.get('name'))
            type_lst.append(lst.get('owner').get('type'))
            create_lst.append(lst.get('created_at'))
            size_lst.append(lst.get('size'))
            star_lst.append(lst.get('stargazers_count'))
            fork_lst.append(lst.get('forks_count'))
            login_lst.append(lst.get('owner').get('login'))
    df = pd.DataFrame([name_lst, type_lst, create_lst, size_lst, star_lst, fork_lst, login_lst])
    df = df.transpose()
    df.columns = ['name', 'type', 'created_at', 'size', 'stargazers_count', 'fork', 'login']
    return df

# test
topic('deep learning')
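- The unauthenticated GitHub search API is tightly rate-limited, which is why the waits above are needed. As an alternative not used in the original code, requests can be authenticated with a personal access token to raise the limit; a minimal sketch, assuming a token stored in a GITHUB_TOKEN environment variable:
# Sketch only: authenticated search request using a personal access token.
# GITHUB_TOKEN is an assumed environment variable, not part of the original code.
import os
from urllib.request import Request, urlopen

req = Request('https://api.github.com/search/repositories?q=deep%20learning&sort=stars&per_page=100&page=1',
              headers={'Authorization': 'token {}'.format(os.environ['GITHUB_TOKEN'])})
data = json.loads(urlopen(req).read().decode('utf-8'))
print(data['total_count'])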
-> With the crawling approach above, crawl and build databases by repository popularity (stars), big tech companies (Google, MS, Intel, Facebook, Apple, Amazon, etc.), and future technologies (autonomous vehicles, metaverse).
- Run the crawl with project name, topic keyword, and star number as columns
import requests
from bs4 import BeautifulSoup

stars = input("Enter a star threshold: ")
url = "https://github.com/search?p=1&q=stars%3A%3E{}&type=Repositories".format(stars)

def crawling_func(url):
    print(url)
    try:
        res = requests.get(url)
        res.raise_for_status()
        soup = BeautifulSoup(res.text, "lxml")
        p = soup.find_all("a", attrs={"class": "v-align-middle"})
    except:
        time.sleep(1)
        p = crawling_func(url)  # call recursively to cope with errors
    return p

topic_ads = []
pages = int(input("Number of pages to search (e.g. 10): "))
print("Repositories with at least {} stars: crawling the top {} pages.".format(stars, pages))
print()
for i in range(1, pages + 1):  # iterate up to page x
    url = "https://github.com/search?p={}&q=stars%3A%3E{}&type=Repositories".format(i, stars)  # page formatting
    time.sleep(15)  # errors occurred even with a 10-second gap
    for j in crawling_func(url):
        topic_ads.append(j.get_text())
        print(j.get_text())

topics_dic = {}
topics_list = []
for ad in topic_ads:
    url_topic = "https://github.com/" + ad
    res_topic = requests.get(url_topic)
    res_topic.raise_for_status()
    soup_topic = BeautifulSoup(res_topic.text, "lxml")
    topic = soup_topic.find("div", attrs={"class": "BorderGrid-cell"}).find_all("a", attrs={"class": "topic-tag topic-tag-link"})
    project_topics = [i.get_text().replace("\n", "").replace("\t", "").strip() for i in topic]
    # star count (bookmark count, i.e. popularity)
    star_num = soup_topic.find("ul", attrs={"class": "pagehead-actions flex-shrink-0 d-none d-md-inline"}).find("a", attrs={"class": "social-count js-social-count"}).get_text()
    star_num = star_num.replace('\t', '').replace('\n', '').strip()
    topics_list.append([ad, project_topics, star_num])
    for t in project_topics:
        if t in topics_dic:
            topics_dic[t] += 1
        else:
            topics_dic[t] = 1

# sort by value in descending order
topics_dic = sorted(topics_dic.items(), key=lambda x: x[1], reverse=True)
df_topic2 = pd.DataFrame(topics_list, columns=['project_name', 'topic_keyword', 'star_number'])
# save as Excel
df_topic2.to_excel('Topics_stars{}_project_keyword.xlsx'.format(stars), index=False)
print()
print("Projects with at least {} stars: data for the top {} pages has been saved.".format(stars, pages))
print("Done.")
- TF-IDF vectorization of each project's topic keywords -> carried out after grouping synonymous topics together and accumulating topic counts into a dictionary (a sketch of the same_things synonym-normalization helper referenced later follows the vectorizer code below)
from sklearn.feature_extraction.text import TfidfVectorizer

vectorize = TfidfVectorizer(
    min_df=5  # ignore rarely occurring terms (here: terms appearing in fewer than 5 documents)
    # min_df = 0.01 : ignore terms appearing in less than 1% of documents
    # min_df = 10   : ignore terms appearing in fewer than 10 documents
    # max_df = 0.80 : ignore terms appearing in 80% or more of documents
    # max_df = 10   : ignore terms appearing in 10 or more documents
)
# df['topic_keyword_str'] holds each project's topic keywords joined into one string
X = vectorize.fit_transform(df['topic_keyword_str'])
print('fit_transform, (sentence {}, feature {})'.format(X.shape[0], X.shape[1]))
# array of features extracted from the documents
features = vectorize.get_feature_names()  # get_feature_names_out() on newer scikit-learn
X.toarray()
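- The clustering code further down calls a helper same_things to merge synonymous topic spellings before counting them; its definition does not appear in this section. A minimal sketch, assuming a small hand-built table of equivalent topic strings (the table below is illustrative only, not the one actually used):
# Hypothetical sketch of the synonym-normalization helper used later.
SAME_TOPICS = {
    'deeplearning': 'deep-learning',
    'deep-neural-networks': 'deep-learning',
    'machinelearning': 'machine-learning',
    'ml': 'machine-learning',
}

def same_things(topic):
    # Map alternative spellings of a topic to one canonical form; pass others through unchanged.
    topic = topic.strip().lower()
    return SAME_TOPICS.get(topic, topic)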
- First-round DBSCAN clustering of similar topics -> carried out after reducing dimensionality with PCA -> here the dimensionality was reduced from 216 to 155 so that only about 5% of the information is lost (a sketch of how that component count can be derived follows the PCA code below).
from sklearn.decomposition import PCA

# tfidf_vector_df: DataFrame form of the TF-IDF vectors computed above
# PCA was also run without reducing dimensionality; 155 components retain 95% of the variance
pca = PCA(n_components=155)
df_pca = pca.fit_transform(tfidf_vector_df)
df_pca = pd.DataFrame(df_pca, index=tfidf_vector_df.index,
                      columns=[f"pca{num+1}" for num in range(df_pca.shape[1])])
from sklearn.cluster import DBSCAN

model = DBSCAN(eps=0.4, min_samples=5, metric='cosine')
result = model.fit_predict(df_pca)
set(result)
df_result = df.copy()
df_result['result'] = result

j = 0
keyword = []
topic = []
num = []
for cluster_num in set(result):
    if cluster_num == -1 or cluster_num == 0:
        continue  # discard noise data (cluster -1) and cluster 0
    else:
        print('cluster num : {}'.format(cluster_num))
        temp_df = df_result[df_result['result'] == cluster_num]
        i = 0
        for k in temp_df['topic_keyword_str']:
            keyword.append(k)
            num.append(cluster_num)
            i = i + 1
        # print('number of items in this cluster: ', i)
        for t in temp_df['project_name']:
            topic.append(t)
        j += i
        print()

dic_cluster = {}
dic_cluster['topic'] = topic
dic_cluster['keyword'] = keyword
dic_cluster['number'] = num
df_cluster = pd.DataFrame(dic_cluster)  # DataFrame form used below (step implied in the original)

clusters = {}
keywords = {}
num = []
for i in set(df_cluster['number']):
    n = 0
    cluster = []
    keyword = []
    for j in df_cluster.values:
        if j[2] == i:
            cluster.append(j[0])
            clusters[i] = cluster
            keyword += j[1].split(' ')
            n += 1
        else:
            pass
    keywords[i] = keyword
    num.append(n)

count_items = []
for i in keywords.values():
    count = {}
    for j in i:
        try:
            count[j] += 1
        except:
            count[j] = 1
    val = sorted(count.items(), key=lambda x: x[1], reverse=True)
    count_items.append(val[:15])

df_cluster_ = pd.DataFrame()
df_cluster_['clusters'] = clusters.values()
df_cluster_['cluster_num'] = clusters.keys()
df_cluster_['count'] = num
df_cluster_['top_15_topics'] = count_items
df_cluster_
- Second-round DBSCAN clustering of the topics with cluster_num = 1
num_list = []
for idx, num in enumerate(result):
    if num == 1:
        num_list.append(df_pca.iloc[idx])
df_result2 = df_result[df_result['result'] == 1]
df_pca2 = pd.DataFrame(num_list)
df_pca2 = df_pca2.loc[~df_pca2.index.duplicated(keep='first')]
tfidf_vector_df2 = pd.merge(tfidf_vector_df, df_pca2, left_index=True, right_index=True, how='inner', sort=False)
tfidf_vector_df2.drop(tfidf_vector_df2.iloc[:, 216:], axis=1, inplace=True)  # keep only the 216 TF-IDF feature columns
tfidf_vector_df2 = tfidf_vector_df2.loc[~tfidf_vector_df2.index.duplicated(keep='first')]
pca = PCA(n_components=155)
df_pca2 = pca.fit_transform(tfidf_vector_df2)
df_pca2 = pd.DataFrame(df_pca2, index=tfidf_vector_df2.index,
                       columns=[f"pca{num+1}" for num in range(df_pca2.shape[1])])
model2 = DBSCAN(eps=0.3, min_samples=5, metric='cosine')  # parameters re-tuned for the second pass
result2 = model2.fit_predict(df_pca2)
set(result2)
# from here on, the procedure is the same as the clustering above
- Information collection by searched company name
# org = ["aws","facebook","google","naver","kakao","apple","alibaba","tencent","baidu","microsoft","samsung"]
org = list(input("Enter the company names to search in English (separate multiple names with spaces): ").split())
org_dic = {}
for o in org:
    url = "https://github.com/orgs/{}/repositories".format(o)
    print("Starting to collect information from {}.".format(url))
    res = requests.get(url)
    try:
        res.raise_for_status()
    except:
        print("No information exists for the company \"{}\" you entered.\n".format(o))
        continue
    soup = BeautifulSoup(res.text, "lxml")
    try:
        max_page = int(soup.find("div", attrs={"role": "navigation"}).find_all("a")[-2].get_text())
    except:
        max_page = 1
    item_temp = []
    for p in range(1, max_page + 1):
        time.sleep(1)
        url = "https://github.com/orgs/{}/repositories?page={}".format(o, p)
        res = requests.get(url)
        res.raise_for_status()
        soup = BeautifulSoup(res.text, "lxml")
        print("{}collecting {}{}".format("*"*10, o, "*"*10))
        for item in soup.find("div", attrs={"class": "org-repos repo-list"}).find_all("li", attrs={"class": "Box-row"}):
            print(item.a.get_text().strip())
            item_temp.append(item.a.get_text().strip())
    org_dic[o] = item_temp
    print()
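- The DBSCAN step below reads {org}.xlsx (with ProjectName and Topics columns) and {org}_vectors.xlsx; the step that collects each repository's topics and writes these files is not shown in this section. A minimal sketch of producing the vector file, assuming {org}.xlsx already exists with a Topics column holding list-like strings:
# Sketch only: build {org}_vectors.xlsx from the Topics column of {org}.xlsx,
# reusing the TF-IDF approach shown earlier. File names and columns are assumptions.
for o in org:
    df_org = pd.read_excel("{}.xlsx".format(o))
    topics_as_text = df_org['Topics'].astype(str).str.replace(r"[\[\]',]", " ", regex=True)
    vec = TfidfVectorizer(min_df=1)
    X_org = vec.fit_transform(topics_as_text)
    pd.DataFrame(X_org.toarray()).to_excel("{}_vectors.xlsx".format(o), index=False)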
- DBSCAN Clustering
for o in org:
    print("{} {} DBSCAN clustering start {}".format("*"*10, o, "*"*10))
    excel_name = "{}_vectors.xlsx".format(o)
    df_org = pd.read_excel("{}.xlsx".format(o))
    df_vector = pd.read_excel(excel_name)
    # more accurate clusters come out as the eps value is tuned step by step
    dbscan = DBSCAN(eps=0.3)
    dbscan_cluster = dbscan.fit_predict(df_vector)
    dbscan_cluster

    dbscan_clustered_dic = {}
    dbscan_clustered_list = []
    dbscan_cluster_num = len(set(dbscan_cluster))
    for idx, i in enumerate(dbscan_cluster):
        if i not in dbscan_clustered_dic:
            dbscan_clustered_dic[i] = [df_org['ProjectName'][idx]]
        else:
            dbscan_clustered_dic[i].append(df_org['ProjectName'][idx])
    # repositories grouped by cluster (around 20 clusters were produced)
    dbscan_clustered_dic = sorted(dbscan_clustered_dic.items(), key=lambda x: x[0])
    df_dbscan_cluster = pd.DataFrame(dbscan_clustered_dic, columns=['num', 'clusters'])
    dbscan_cluster_num = [len(i) for i in df_dbscan_cluster['clusters']]
    df_dbscan_cluster['cluster_num'] = dbscan_cluster_num

    topic_dbscan_clustered_list = []
    for cluster in df_dbscan_cluster['clusters']:
        temp_dic = {}
        for j in cluster:
            topics = df_org[df_org['ProjectName'] == j]['Topics'].values[0].replace("[", "").replace("]", "").replace("'", "").strip().split(",")
            for t in topics:
                if len(t) == 0:
                    continue
                t = same_things(t)  # merge synonymous topic spellings
                if t not in temp_dic:
                    temp_dic[t] = 1
                else:
                    temp_dic[t] += 1
        temp_dic = sorted(temp_dic.items(), key=lambda x: x[1], reverse=True)
        # print(temp_dic[:15])  # show only the top 15
        topic_dbscan_clustered_list.append(temp_dic[:15])
    df_dbscan_cluster['top_15_topics'] = topic_dbscan_clustered_list
    df_dbscan_cluster.to_excel("{}_DBSCAN_clusters.xlsx".format(o), index=False)
    print(df_dbscan_cluster)
    print("{}_DBSCAN_clusters.xlsx".format(o), "saved")
    print("*"*50)
- Information collection by searched technology name
topic_name = input("Enter a technology name: ")
url = "https://github.com/topics/{}?o=desc&s=stars".format(topic_name)
# my computer's User_Agent
res = requests.get(url)
res.raise_for_status()
soup = BeautifulSoup(res.text, "lxml")
soup

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument("headless")  # add the headless option
browser = webdriver.Chrome("./chromedriver", options=options)
# browser = webdriver.Chrome("./chromedriver")
browser.get(url)
soup = BeautifulSoup(browser.page_source, 'lxml')

# How many times should "Load more" be clicked?
# Arbitrarily set to 100 here, but it is also possible to keep clicking the Load_more button
# until, for example, the star count drops below some threshold x.
Load_more_times = 100
for _ in range(Load_more_times):
    prev = len(soup.find_all("article", attrs={"class": "border rounded color-shadow-small color-bg-subtle my-4"}))
    try:
        browser.find_element_by_xpath("//*[@id=\"js-pjax-container\"]/div[2]/div[2]/div/div[1]/form/button").click()
    except:
        print("End")
        break
    # wait until the newly loaded articles appear in the page source
    while 1:
        soup = BeautifulSoup(browser.page_source, 'lxml')
        if prev < len(soup.find_all("article", attrs={"class": "border rounded color-shadow-small color-bg-subtle my-4"})):
            prev = len(soup.find_all("article", attrs={"class": "border rounded color-shadow-small color-bg-subtle my-4"}))
            break
print(prev, "items loaded.")

# crawl and collect the topics
soup = BeautifulSoup(browser.page_source, 'lxml')
topics = soup.find_all("h3", attrs={"class": "f3 color-fg-muted text-normal lh-condensed"})
topic_ads = []
for i in topics:
    topic_ad = "".join(i.get_text().strip().replace("\n", "").split())
    topic_ads.append(topic_ad)
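- The DBSCAN step below expects a df_topic table (project_name, topic_keyword) and keyword({topic})_vectors.xlsx; the intermediate step that visits each repository in topic_ads, collects its topic tags, and vectorizes them is not shown here. A rough sketch, assuming it follows the same pattern as the star-based crawl and TF-IDF vectorization above:
# Sketch only: collect topic tags per repository and vectorize them.
# The actual intermediate code is not shown in this section; names are assumptions.
rows = []
for ad in topic_ads:
    soup_t = BeautifulSoup(requests.get("https://github.com/" + ad).text, "lxml")
    tags = [a.get_text().strip() for a in soup_t.find_all("a", attrs={"class": "topic-tag topic-tag-link"})]
    rows.append([ad, tags])
df_topic = pd.DataFrame(rows, columns=['project_name', 'topic_keyword'])
vec_kw = TfidfVectorizer(min_df=1)
X_kw = vec_kw.fit_transform(df_topic['topic_keyword'].apply(' '.join))
pd.DataFrame(X_kw.toarray()).to_excel("keyword({})_vectors.xlsx".format(topic_name), index=False)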
- DBSCAN Clustering
print("{} {} DBSCAN ํด๋ฌ์คํฐ๋ง ์์ {}".format("*"*10,topic_name,"*"*10))
excel_name = "keyword({})_vectors.xlsx".format(topic_name)
df_vector = pd.read_excel(excel_name)
# eps ๊ฐ์ ์กฐ์ ํด๋๊ฐ๋ฉด์ ํด๋ฌ์คํฐ๋ง์ ํด์ผ ๋ ์ ํํ ๊ฒฐ๊ณผ๊ฐ ๋์จ๋ค
dbscan = DBSCAN(eps = 0.3)
dbscan_cluster = dbscan.fit_predict(df_vector)
dbscan_cluster
dbscan_clustered_dic = {}
dbscan_clustered_list = []
dbscan_cluster_num = len(set(dbscan_cluster))
for idx,i in enumerate(dbscan_cluster):
if i not in dbscan_clustered_dic:
dbscan_clustered_dic[i] = [df_topic['project_name'][idx]]
else:
dbscan_clustered_dic[i].append(df_topic['project_name'][idx])
# ํด๋ฌ์คํฐ๋ง์ด ๋ ํจํค์ง๋ค
# 20๊ฐ์ ๊ตฐ์ง์ผ๋ก ์์ฑ
dbscan_clustered_dic = sorted(dbscan_clustered_dic.items(), key=lambda x: x[0])
df_dbscan_cluster = pd.DataFrame(dbscan_clustered_dic,columns=['num','clusters'])
dbscan_cluster_num = [len(i) for i in df_dbscan_cluster['clusters']]
df_dbscan_cluster['cluster_num'] = dbscan_cluster_num
topic_dbscan_clustered_list = []
for i in df_dbscan_cluster['clusters']:
temp_dic = {}
for j in i:
topics = df_topic[df_topic['project_name']==j]['topic_keyword'].values[0]
for i in topics:
if len(i)==0:
continue
i = same_things(i)
if i not in temp_dic:
temp_dic[i] = 1
else:
temp_dic[i] += 1
temp_dic = sorted(temp_dic.items(), key=lambda x: x[1], reverse=True)
# print(temp_dic[:15]) # ์์ 15๊ฐ๋ง ๋ณด์ฌ์ค
# print()
topic_dbscan_clustered_list.append(temp_dic[:15])
df_dbscan_cluster['top_15_topics'] = topic_dbscan_clustered_list
df_dbscan_cluster.to_excel("{}_DBSCAN_clusters.xlsx".format(topic_name),index=False)
print(df_dbscan_cluster)
print("{}_DBSCAN_clusters.xlsx".format(topic_name),"์ ์ฅ์๋ฃ")
print("*"*50)











