gperdrizet committed on
Commit
84cdb3a
·
verified ·
1 Parent(s): 60bdf31

Added README text to GitHub repo data retrieval

Browse files
Files changed (2) hide show
  1. functions/github.py +97 -1
  2. tests/test_github.py +227 -12
functions/github.py CHANGED
@@ -12,6 +12,7 @@ from pathlib import Path
12
  from datetime import datetime
13
 
14
  import requests
 
15
 
16
  # pylint: disable=broad-exception-caught
17
 
@@ -36,7 +37,8 @@ def get_github_repositories(username: str) -> list:
36
  "forks": 2,
37
  "updated_at": "2024-01-01T00:00:00Z",
38
  "html_url": "https://github.com/user/repo",
39
- "topics": ["python", "api"]
 
40
  }
41
  ]
42
  """
@@ -185,6 +187,14 @@ def _process_repository_data(repos: List[Dict]) -> List[Dict]:
185
  "size": repo.get("size", 0)
186
  }
187
 
 
 
 
 
 
 
 
 
188
  processed.append(processed_repo)
189
 
190
  except Exception as e:
@@ -192,3 +202,89 @@ def _process_repository_data(repos: List[Dict]) -> List[Dict]:
192
  continue
193
 
194
  return processed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  from datetime import datetime
13
 
14
  import requests
15
+ import base64
16
 
17
  # pylint: disable=broad-exception-caught
18
 
 
37
  "forks": 2,
38
  "updated_at": "2024-01-01T00:00:00Z",
39
  "html_url": "https://github.com/user/repo",
40
+ "topics": ["python", "api"],
41
+ "readme": "# Project Title\n\nProject description..."
42
  }
43
  ]
44
  """
 
187
  "size": repo.get("size", 0)
188
  }
189
 
190
+ # Get README content for the repository
191
+ repo_url = repo.get("html_url", "")
192
+ if repo_url:
193
+ readme_content = get_repository_readme(repo_url)
194
+ processed_repo["readme"] = readme_content
195
+ else:
196
+ processed_repo["readme"] = ""
197
+
198
  processed.append(processed_repo)
199
 
200
  except Exception as e:
 
202
  continue
203
 
204
  return processed
205
+
206
+
207
def get_repository_readme(repo_url: str) -> str:
    """
    Get the fulltext content of a repository's README file.

    The owner and repository name are parsed from ``repo_url`` and the README
    is requested from the GitHub REST API, which returns it base64 encoded.
    Every failure mode (malformed URL, missing README, API or network error,
    undecodable content) is logged and reported as an empty string, so this
    function does not raise.

    Args:
        repo_url (str): GitHub repository URL (e.g., "https://github.com/owner/repo").
            Trailing slashes are tolerated; anything other than exactly an
            owner and a repository name after the host is rejected.

    Returns:
        str: README file content as text, or empty string if not found/error

    Example:
        >>> readme_content = get_repository_readme("https://github.com/owner/repo")
        >>> print(readme_content[:100])
        # My Project

        This is a sample project that does...
    """

    logger = logging.getLogger(f'{__name__}.get_repository_readme')

    # Validate before touching the network so bad input never costs a request.
    owner_and_repo = _split_github_repo_url(repo_url)

    if owner_and_repo is None:
        logger.error("Invalid GitHub URL format, expected owner/repo: %s", repo_url)
        return ""

    owner, repo = owner_and_repo

    logger.info("Fetching README for repository: %s/%s", owner, repo)

    # GitHub API endpoint for README
    api_url = f"https://api.github.com/repos/{owner}/{repo}/readme"

    headers = {
        "Accept": "application/vnd.github.v3+json",
        "User-Agent": "Resumate-App/1.0"
    }

    try:
        response = requests.get(api_url, headers=headers, timeout=10)

        if response.status_code == 404:
            logger.info("No README file found for repository: %s/%s", owner, repo)
            return ""

        if response.status_code != 200:
            logger.error("GitHub API error fetching README: %s", response.status_code)
            return ""

        readme_data = response.json()

    except requests.RequestException as e:
        logger.error("Network error fetching README: %s", e)
        return ""

    except Exception as e:
        # Last-resort guard (e.g. a body that is not valid JSON): callers rely
        # on an empty string rather than an exception.
        logger.error("Error retrieving README: %s", e)
        return ""

    return _decode_readme_content(readme_data, logger)


# Characters accepted in an owner or repository name; anything else (query
# strings, fragments, whitespace, ...) would corrupt the API path.
_GITHUB_URL_PREFIX = "https://github.com/"
_GITHUB_NAME_CHARS = frozenset(
    "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_."
)


def _split_github_repo_url(repo_url):
    """Return ``(owner, repo)`` parsed from a GitHub repository URL, or None if invalid."""

    if not isinstance(repo_url, str) or not repo_url.startswith(_GITHUB_URL_PREFIX):
        return None

    # Slice off the prefix (rather than str.replace, which would also strip
    # later occurrences) and ignore trailing slashes.
    parts = repo_url[len(_GITHUB_URL_PREFIX):].rstrip("/").split("/")

    if len(parts) != 2:
        return None

    for part in parts:
        # Reject empty segments, path-traversal segments and illegal characters.
        if not part or part in (".", "..") or not set(part) <= _GITHUB_NAME_CHARS:
            return None

    return parts[0], parts[1]


def _decode_readme_content(readme_data, logger) -> str:
    """Decode the base64 ``content`` field of a README API payload, or return ""."""

    # README content is base64 encoded
    if not isinstance(readme_data, dict) or "content" not in readme_data:
        logger.error("README API response missing content field")
        return ""

    # A payload without an "encoding" field is treated as base64, matching the
    # behaviour before the field was checked.
    encoding = readme_data.get("encoding", "base64")

    if encoding != "base64":
        logger.error("Unsupported README content encoding: %s", encoding)
        return ""

    encoded_content = readme_data["content"]

    if not isinstance(encoded_content, str):
        logger.error("README API response content field is not a string")
        return ""

    try:
        # Drop all whitespace (the payload is line-wrapped), then decode
        # strictly so corrupt input is reported instead of silently mangled.
        raw_bytes = base64.b64decode("".join(encoded_content.split()), validate=True)
        decoded_content = raw_bytes.decode('utf-8')

    except ValueError as decode_error:
        # binascii.Error and UnicodeDecodeError are both ValueError subclasses.
        logger.error("Error decoding README content: %s", decode_error)
        return ""

    logger.info("Successfully retrieved README content (%d characters)", len(decoded_content))
    return decoded_content
tests/test_github.py CHANGED
@@ -5,6 +5,7 @@ Unit tests for the github module.
5
  import unittest
6
  from unittest.mock import patch, MagicMock
7
  import requests
 
8
  from functions import github
9
 
10
  # pylint: disable=protected-access
@@ -44,7 +45,8 @@ class TestGetGitHubRepositories(unittest.TestCase):
44
  "created_at": "2024-01-01T00:00:00Z",
45
  "html_url": "https://github.com/user/test-repo",
46
  "topics": ["python", "test"],
47
- "size": 100
 
48
  }
49
  ]
50
 
@@ -222,8 +224,11 @@ class TestGetUserRepositories(unittest.TestCase):
222
  class TestProcessRepositoryData(unittest.TestCase):
223
  """Test cases for the _process_repository_data function."""
224
 
225
- def test_basic_processing(self):
 
226
  """Test basic repository data processing."""
 
 
227
  raw_repos = [
228
  {
229
  "name": "test-repo",
@@ -254,24 +259,34 @@ class TestProcessRepositoryData(unittest.TestCase):
254
  self.assertEqual(processed_repo["html_url"], "https://github.com/user/test-repo")
255
  self.assertEqual(processed_repo["topics"], ["python", "test"])
256
  self.assertEqual(processed_repo["size"], 100)
 
 
 
 
257
 
258
- def test_fork_filtering(self):
 
259
  """Test filtering of unmodified forks."""
 
 
260
  raw_repos = [
261
  {
262
  "name": "original-repo",
263
  "fork": False,
264
- "stargazers_count": 5
 
265
  },
266
  {
267
  "name": "unmodified-fork",
268
  "fork": True,
269
- "stargazers_count": 0
 
270
  },
271
  {
272
  "name": "modified-fork",
273
  "fork": True,
274
- "stargazers_count": 3
 
275
  }
276
  ]
277
 
@@ -283,9 +298,15 @@ class TestProcessRepositoryData(unittest.TestCase):
283
  self.assertIn("original-repo", repo_names)
284
  self.assertIn("modified-fork", repo_names)
285
  self.assertNotIn("unmodified-fork", repo_names)
 
 
 
286
 
287
- def test_missing_fields(self):
 
288
  """Test handling of missing fields in repository data."""
 
 
289
  raw_repos = [
290
  {
291
  "name": "minimal-repo"
@@ -307,20 +328,29 @@ class TestProcessRepositoryData(unittest.TestCase):
307
  self.assertEqual(processed_repo["html_url"], "")
308
  self.assertEqual(processed_repo["topics"], [])
309
  self.assertEqual(processed_repo["size"], 0)
 
 
 
 
310
 
311
- def test_processing_error_handling(self):
 
312
  """Test handling of processing errors for individual repos."""
 
 
313
  # Create a repo dict that will cause an error during processing
314
  raw_repos = [
315
  {
316
  "name": "good-repo",
317
- "stargazers_count": 5
 
318
  },
319
  # This will cause an AttributeError when trying to call .get() on None
320
  None,
321
  {
322
- "name": "another-good-repo",
323
- "stargazers_count": 3
 
324
  }
325
  ]
326
 
@@ -332,11 +362,34 @@ class TestProcessRepositoryData(unittest.TestCase):
332
  with self.assertRaises(AttributeError):
333
  github._process_repository_data(raw_repos)
334
 
335
- def test_empty_repository_list(self):
 
336
  """Test processing of empty repository list."""
337
  result = github._process_repository_data([])
338
 
339
  self.assertEqual(result, [])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
340
 
341
  def test_all_forks_filtered(self):
342
  """Test when all repositories are unmodified forks."""
@@ -358,5 +411,167 @@ class TestProcessRepositoryData(unittest.TestCase):
358
  self.assertEqual(result, [])
359
 
360
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
361
  if __name__ == '__main__':
362
  unittest.main()
 
5
  import unittest
6
  from unittest.mock import patch, MagicMock
7
  import requests
8
+ import base64
9
  from functions import github
10
 
11
  # pylint: disable=protected-access
 
45
  "created_at": "2024-01-01T00:00:00Z",
46
  "html_url": "https://github.com/user/test-repo",
47
  "topics": ["python", "test"],
48
+ "size": 100,
49
+ "readme": "# Test Repository\n\nThis is a test README."
50
  }
51
  ]
52
 
 
224
  class TestProcessRepositoryData(unittest.TestCase):
225
  """Test cases for the _process_repository_data function."""
226
 
227
+ @patch('functions.github.get_repository_readme')
228
+ def test_basic_processing(self, mock_get_readme):
229
  """Test basic repository data processing."""
230
+ mock_get_readme.return_value = "# Test Repository\n\nThis is a test README."
231
+
232
  raw_repos = [
233
  {
234
  "name": "test-repo",
 
259
  self.assertEqual(processed_repo["html_url"], "https://github.com/user/test-repo")
260
  self.assertEqual(processed_repo["topics"], ["python", "test"])
261
  self.assertEqual(processed_repo["size"], 100)
262
+ self.assertEqual(processed_repo["readme"], "# Test Repository\n\nThis is a test README.")
263
+
264
+ # Verify README was fetched
265
+ mock_get_readme.assert_called_once_with("https://github.com/user/test-repo")
266
 
267
+ @patch('functions.github.get_repository_readme')
268
+ def test_fork_filtering(self, mock_get_readme):
269
  """Test filtering of unmodified forks."""
270
+ mock_get_readme.return_value = "# Repository README"
271
+
272
  raw_repos = [
273
  {
274
  "name": "original-repo",
275
  "fork": False,
276
+ "stargazers_count": 5,
277
+ "html_url": "https://github.com/user/original-repo"
278
  },
279
  {
280
  "name": "unmodified-fork",
281
  "fork": True,
282
+ "stargazers_count": 0,
283
+ "html_url": "https://github.com/user/unmodified-fork"
284
  },
285
  {
286
  "name": "modified-fork",
287
  "fork": True,
288
+ "stargazers_count": 3,
289
+ "html_url": "https://github.com/user/modified-fork"
290
  }
291
  ]
292
 
 
298
  self.assertIn("original-repo", repo_names)
299
  self.assertIn("modified-fork", repo_names)
300
  self.assertNotIn("unmodified-fork", repo_names)
301
+
302
+ # Verify README was fetched for included repos only
303
+ self.assertEqual(mock_get_readme.call_count, 2)
304
 
305
+ @patch('functions.github.get_repository_readme')
306
+ def test_missing_fields(self, mock_get_readme):
307
  """Test handling of missing fields in repository data."""
308
+ mock_get_readme.return_value = ""
309
+
310
  raw_repos = [
311
  {
312
  "name": "minimal-repo"
 
328
  self.assertEqual(processed_repo["html_url"], "")
329
  self.assertEqual(processed_repo["topics"], [])
330
  self.assertEqual(processed_repo["size"], 0)
331
+ self.assertEqual(processed_repo["readme"], "")
332
+
333
+ # Verify README function was NOT called since there's no URL
334
+ mock_get_readme.assert_not_called()
335
 
336
+ @patch('functions.github.get_repository_readme')
337
+ def test_processing_error_handling(self, mock_get_readme):
338
  """Test handling of processing errors for individual repos."""
339
+ mock_get_readme.return_value = "README content"
340
+
341
  # Create a repo dict that will cause an error during processing
342
  raw_repos = [
343
  {
344
  "name": "good-repo",
345
+ "stargazers_count": 5,
346
+ "html_url": "https://github.com/user/good-repo"
347
  },
348
  # This will cause an AttributeError when trying to call .get() on None
349
  None,
350
  {
351
+ "name": "another-good-repo",
352
+ "stargazers_count": 3,
353
+ "html_url": "https://github.com/user/another-good-repo"
354
  }
355
  ]
356
 
 
362
  with self.assertRaises(AttributeError):
363
  github._process_repository_data(raw_repos)
364
 
365
+ @patch('functions.github.get_repository_readme')
366
+ def test_empty_repository_list(self, mock_get_readme):
367
  """Test processing of empty repository list."""
368
  result = github._process_repository_data([])
369
 
370
  self.assertEqual(result, [])
371
+ # Verify no README calls were made
372
+ mock_get_readme.assert_not_called()
373
+
374
+ @patch('functions.github.get_repository_readme')
375
+ def test_readme_retrieval_error_handling(self, mock_get_readme):
376
+ """Test handling when README retrieval fails."""
377
+ # Simulate README function returning empty string (error case)
378
+ mock_get_readme.return_value = ""
379
+
380
+ raw_repos = [
381
+ {
382
+ "name": "test-repo",
383
+ "html_url": "https://github.com/user/test-repo",
384
+ "stargazers_count": 5
385
+ }
386
+ ]
387
+
388
+ result = github._process_repository_data(raw_repos)
389
+
390
+ self.assertEqual(len(result), 1)
391
+ self.assertEqual(result[0]["readme"], "")
392
+ mock_get_readme.assert_called_once_with("https://github.com/user/test-repo")
393
 
394
  def test_all_forks_filtered(self):
395
  """Test when all repositories are unmodified forks."""
 
411
  self.assertEqual(result, [])
412
 
413
 
414
class TestGetRepositoryReadme(unittest.TestCase):
    """Test cases for the get_repository_readme function."""

    REPO_URL = "https://github.com/owner/repo"
    README_API_URL = "https://api.github.com/repos/owner/repo/readme"

    @staticmethod
    def _api_response(status_code, payload=None):
        """Build a mocked response with the given status code and optional JSON body."""
        response = MagicMock()
        response.status_code = status_code
        if payload is not None:
            response.json.return_value = payload
        return response

    @classmethod
    def _readme_response(cls, text):
        """Build a 200 response whose JSON body carries ``text`` base64 encoded."""
        encoded = base64.b64encode(text.encode('utf-8')).decode('ascii')
        return cls._api_response(200, {"content": encoded, "encoding": "base64"})

    @patch('requests.get')
    def test_successful_readme_retrieval(self, mock_get):
        """A 200 response with base64 content is decoded and returned."""
        expected = "# Test Repository\n\nThis is a test README file."
        mock_get.return_value = self._readme_response(expected)

        result = github.get_repository_readme(self.REPO_URL)

        self.assertEqual(result, expected)
        mock_get.assert_called_once()
        positional, keyword = mock_get.call_args
        self.assertIn(self.README_API_URL, positional[0])
        self.assertEqual(keyword["headers"]["User-Agent"], "Resumate-App/1.0")

    @patch('requests.get')
    def test_readme_not_found(self, mock_get):
        """A 404 response yields an empty string."""
        mock_get.return_value = self._api_response(404)

        self.assertEqual(github.get_repository_readme(self.REPO_URL), "")

    @patch('requests.get')
    def test_api_error(self, mock_get):
        """A non-200, non-404 response yields an empty string."""
        mock_get.return_value = self._api_response(500)

        self.assertEqual(github.get_repository_readme(self.REPO_URL), "")

    @patch('requests.get')
    def test_network_error(self, mock_get):
        """A requests-level failure yields an empty string."""
        mock_get.side_effect = requests.RequestException("Connection error")

        self.assertEqual(github.get_repository_readme(self.REPO_URL), "")

    def test_invalid_url_format(self):
        """URLs that are not https://github.com/owner/repo yield an empty string."""
        invalid_urls = (
            "https://gitlab.com/owner/repo",
            "https://github.com/owner",
            "https://github.com/owner/repo/extra/path",
            "not-a-url",
            "",
            "https://github.com/",
        )

        for url in invalid_urls:
            with self.subTest(url=url):
                self.assertEqual(github.get_repository_readme(url), "")

    @patch('requests.get')
    def test_missing_content_field(self, mock_get):
        """A 200 response without a "content" field yields an empty string."""
        mock_get.return_value = self._api_response(200, {"encoding": "base64"})

        self.assertEqual(github.get_repository_readme(self.REPO_URL), "")

    @patch('requests.get')
    def test_invalid_base64_content(self, mock_get):
        """Content that cannot be decoded yields an empty string."""
        payload = {
            "content": "invalid-base64-content!@#$",
            "encoding": "base64",
        }
        mock_get.return_value = self._api_response(200, payload)

        self.assertEqual(github.get_repository_readme(self.REPO_URL), "")

    @patch('requests.get')
    def test_unicode_readme_content(self, mock_get):
        """Non-ASCII README text survives the round trip."""
        expected = "# Test 🚀\n\nEmoji and unicode: 中文 русский"
        mock_get.return_value = self._readme_response(expected)

        self.assertEqual(github.get_repository_readme(self.REPO_URL), expected)

    @patch('requests.get')
    def test_large_readme_content(self, mock_get):
        """A README of well over 10,000 characters is returned in full."""
        expected = "# Large README\n\n" + "This is a line of content.\n" * 1000
        mock_get.return_value = self._readme_response(expected)

        result = github.get_repository_readme(self.REPO_URL)

        self.assertEqual(result, expected)
        self.assertGreater(len(result), 10000)  # Verify it's actually large

    @patch('requests.get')
    def test_url_with_trailing_slash(self, mock_get):
        """A trailing slash on the repository URL is ignored."""
        expected = "# Test README"
        mock_get.return_value = self._readme_response(expected)

        result = github.get_repository_readme(self.REPO_URL + "/")

        self.assertEqual(result, expected)
        # Verify the API call used the correct URL without trailing slash
        positional, _ = mock_get.call_args
        self.assertIn(self.README_API_URL, positional[0])
574
+
575
+
576
  if __name__ == '__main__':
577
  unittest.main()