File size: 7,278 Bytes
f7ab812 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 |
{
"cells": [
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Converted April 2024/13_03_24 - UK cancer study shows big fall in death rates since early 1990s.docx to april2024/13_03_24 - UK cancer study shows big fall in death rates since early 1990s.txt\n",
"Converted April 2024/15_03_24 - Health workers’ unions call for Frank Hester to lose NHS contracts.docx to april2024/15_03_24 - Health workers’ unions call for Frank Hester to lose NHS contracts.txt\n",
"Converted April 2024/12_03_24 - Covid vaccines cut risk of virus-related heart failure and blood clots, study finds.docx to april2024/12_03_24 - Covid vaccines cut risk of virus-related heart failure and blood clots, study finds.txt\n",
"Converted April 2024/15_03_24 - Macmillan Cancer Support to axe 150 jobs amid soaring inflation.docx to april2024/15_03_24 - Macmillan Cancer Support to axe 150 jobs amid soaring inflation.txt\n",
"Converted April 2024/15_03_24 - NHS hospitals hit by shortage of life-saving drug for chronic breathing issues.docx to april2024/15_03_24 - NHS hospitals hit by shortage of life-saving drug for chronic breathing issues.txt\n",
"Converted April 2024/14_03_24 - Thank you, NHS England, for offering baby-loss leave.docx to april2024/14_03_24 - Thank you, NHS England, for offering baby-loss leave.txt\n",
"Converted April 2024/11_03_24 - UK report reveals bias within medical tools and devices.docx to april2024/11_03_24 - UK report reveals bias within medical tools and devices.txt\n",
"Converted April 2024/12_03_24 - Bereaved families continue to wait for Essex mental health inquiry.docx to april2024/12_03_24 - Bereaved families continue to wait for Essex mental health inquiry.txt\n",
"Converted April 2024/11_03_24 - An agonising wait for surgery – and NHS rationing – forced me to go private.docx to april2024/11_03_24 - An agonising wait for surgery – and NHS rationing – forced me to go private.txt\n",
"Converted April 2024/12_03_24 - Overweight girls ‘more likely to see GP about musculoskeletal problems’.docx to april2024/12_03_24 - Overweight girls ‘more likely to see GP about musculoskeletal problems’.txt\n",
"Converted April 2024/14_03_24 - 20,000 people off work in the UK every month for mental ill health.docx to april2024/14_03_24 - 20,000 people off work in the UK every month for mental ill health.txt\n",
"Converted April 2024/08_03_24 - Private healthcare could become ‘a new normal’ as NHS grows weaker.docx to april2024/08_03_24 - Private healthcare could become ‘a new normal’ as NHS grows weaker.txt\n",
"Converted April 2024/13_03_24 - Air pollution levels have improved in Europe over 20 years, say researchers.docx to april2024/13_03_24 - Air pollution levels have improved in Europe over 20 years, say researchers.txt\n",
"Converted April 2024/14_03_24 - Neurological conditions now leading cause of ill-health worldwide, finds study.docx to april2024/14_03_24 - Neurological conditions now leading cause of ill-health worldwide, finds study.txt\n",
"Converted April 2024/09_03_24 - Cancer patients may be at risk due to overstretched NHS staff, ombudsman warns.docx to april2024/09_03_24 - Cancer patients may be at risk due to overstretched NHS staff, ombudsman warns.txt\n",
"Converted April 2024/08_03_24 - ‘My GP suggested it’_ Britons explain why they went private for surgery.docx to april2024/08_03_24 - ‘My GP suggested it’_ Britons explain why they went private for surgery.txt\n",
"Converted April 2024/13_03_24 - The desperate struggle to access NHS treatment.docx to april2024/13_03_24 - The desperate struggle to access NHS treatment.txt\n",
"Converted April 2024/10_03_24 - Warning over use in UK of unregulated AI chatbots to create social care plans.docx to april2024/10_03_24 - Warning over use in UK of unregulated AI chatbots to create social care plans.txt\n",
"Converted April 2024/15_03_24 - Why ME_CFS is still so poorly researched and treated.docx to april2024/15_03_24 - Why ME_CFS is still so poorly researched and treated.txt\n",
"Converted April 2024/08_03_24 - Cancer-causing PCB chemicals still being produced despite 40-year-old ban.docx to april2024/08_03_24 - Cancer-causing PCB chemicals still being produced despite 40-year-old ban.txt\n",
"Converted April 2024/11_03_24 - The Guardian view on the privatisation of health_ outsourcing will not save the NHS.docx to april2024/11_03_24 - The Guardian view on the privatisation of health_ outsourcing will not save the NHS.txt\n",
"Converted April 2024/08_03_24 - Private hospitals ‘cannibalising’ NHS in England by doing 10_ of elective operations.docx to april2024/08_03_24 - Private hospitals ‘cannibalising’ NHS in England by doing 10_ of elective operations.txt\n",
"Converted April 2024/10_03_24 - The Price of Life by Jenny Kleeman review – the uncomfortable cost of living.docx to april2024/10_03_24 - The Price of Life by Jenny Kleeman review – the uncomfortable cost of living.txt\n",
"Converted April 2024/11_03_24 - Covid bereaved accuse former Welsh health minister of incompetence.docx to april2024/11_03_24 - Covid bereaved accuse former Welsh health minister of incompetence.txt\n"
]
}
],
"source": [
"from docx import Document\n",
"import os\n",
"def convert_docx_to_txt(input_path, output_path):\n",
"    \"\"\"Write the paragraph text of a Word document to a UTF-8 text file.\n",
"\n",
"    The text of each entry in the document's ``paragraphs`` collection becomes\n",
"    one line of the output. Lines are separated by a single newline and no\n",
"    trailing newline is appended. A confirmation message is printed once the\n",
"    file has been written.\n",
"\n",
"    Args:\n",
"        input_path (str): Location of the .docx file to read.\n",
"        output_path (str): Location of the .txt file to create or overwrite.\n",
"    \"\"\"\n",
"    # Read the document and join its paragraph texts, one paragraph per line.\n",
"    body = '\\n'.join(para.text for para in Document(input_path).paragraphs)\n",
"\n",
"    # Mode 'w' truncates any file that already exists at output_path.\n",
"    with open(output_path, 'w', encoding='utf-8') as out_file:\n",
"        out_file.write(body)\n",
"\n",
"    print(f\"Converted {input_path} to {output_path}\")\n",
"\n",
"# Directory names live in one place so the listing, the output paths and the\n",
"# conversion loop cannot drift apart.\n",
"INPUT_DIR = \"April 2024\"\n",
"OUTPUT_DIR = \"april2024\"\n",
"\n",
"# os.makedirs is portable (a shell `mkdir -p` is not available everywhere) and\n",
"# raises on failure instead of silently returning a non-zero exit status.\n",
"os.makedirs(OUTPUT_DIR, exist_ok=True)\n",
"\n",
"# File names (not full paths) of every Word document in the input directory.\n",
"input_paths = [file for file in os.listdir(INPUT_DIR) if file.endswith(\".docx\")]\n",
"\n",
"# Swap only the trailing extension: str.replace would also rewrite a \".docx\"\n",
"# that happens to appear in the middle of a file name.\n",
"output_paths = [\n",
"    os.path.join(OUTPUT_DIR, os.path.splitext(file)[0] + \".txt\")\n",
"    for file in input_paths\n",
"]\n",
"\n",
"# Convert each document, pairing it with its precomputed output path.\n",
"for input_path, output_path in zip(input_paths, output_paths):\n",
"    convert_docx_to_txt(os.path.join(INPUT_DIR, input_path), output_path)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
|