1 /* 2 * Copyright 2006-2007, Haiku, Inc. All Rights Reserved. 3 * Distributed under the terms of the MIT License. 4 * 5 * Authors: 6 * Andrew Galante, haiku.galante@gmail.com 7 * Axel Dörfler, axeld@pinc-software.de 8 * Hugo Santos, hugosantos@gmail.com 9 */ 10 11 12 #include "TCPEndpoint.h" 13 #include "EndpointManager.h" 14 15 #include <net_buffer.h> 16 #include <net_datalink.h> 17 #include <net_stat.h> 18 #include <NetBufferUtilities.h> 19 #include <NetUtilities.h> 20 21 #include <lock.h> 22 #include <util/AutoLock.h> 23 #include <util/khash.h> 24 #include <util/list.h> 25 26 #include <KernelExport.h> 27 #include <Select.h> 28 29 #include <netinet/in.h> 30 #include <netinet/ip.h> 31 #include <netinet/tcp.h> 32 #include <new> 33 #include <stdlib.h> 34 #include <string.h> 35 36 37 // References: 38 // - RFC 793 - Transmission Control Protocol 39 // - RFC 813 - Window and Acknowledgement Strategy in TCP 40 // 41 // Things this implementation currently doesn't implement: 42 // 43 // TCP Slow Start, Congestion Avoidance, Fast Retransmit, and Fast Recovery, RFC 2001, RFC 2581, RFC 3042 44 // NewReno Modification to TCP's Fast Recovery, RFC 2582 45 // Explicit Congestion Notification (ECN), RFC 3168 46 // SYN-Cache 47 // TCP Extensions for High Performance, RFC 1323 48 // SACK, Selective Acknowledgment - RFC 2018, RFC 2883, RFC 3517 49 // Forward RTO-Recovery, RFC 4138 50 51 #define PrintAddress(address) \ 52 AddressString(Domain(), address, true).Data() 53 54 //#define TRACE_TCP 55 //#define PROBE_TCP 56 57 #ifdef TRACE_TCP 58 // the space before ', ##args' is important in order for this to work with cpp 2.95 59 # define TRACE(format, args...) dprintf("TCP [%llu] %p (%12s) " format "\n", \ 60 system_time(), this, name_for_state(fState) , ##args) 61 #else 62 # define TRACE(args...) do { } while (0) 63 #endif 64 65 #ifdef PROBE_TCP 66 # define PROBE(buffer, window) \ 67 dprintf("TCP PROBE %llu %s %s %ld snxt %lu suna %lu cw %lu sst %lu win %lu swin %lu smax-suna %lu savail %lu sqused %lu rto %llu\n", \ 68 system_time(), PrintAddress(buffer->source), \ 69 PrintAddress(buffer->destination), buffer->size, (uint32)fSendNext, \ 70 (uint32)fSendUnacknowledged, fCongestionWindow, fSlowStartThreshold, \ 71 window, fSendWindow, (uint32)(fSendMax - fSendUnacknowledged), \ 72 fSendQueue.Available(fSendNext), fSendQueue.Used(), fRetransmitTimeout) 73 #else 74 # define PROBE(buffer, window) do { } while (0) 75 #endif 76 77 // Initial estimate for packet round trip time (RTT) 78 #define TCP_INITIAL_RTT 2000000 79 80 // constants for the fFlags field 81 enum { 82 FLAG_OPTION_WINDOW_SCALE = 0x01, 83 FLAG_OPTION_TIMESTAMP = 0x02, 84 // TODO: Should FLAG_NO_RECEIVE apply as well to received connections? 85 // That is, what is expected from accept() after a shutdown() 86 // is performed on a listen()ing socket. 87 FLAG_NO_RECEIVE = 0x04, 88 }; 89 90 91 static const int kTimestampFactor = 1024; 92 93 94 static inline bigtime_t 95 absolute_timeout(bigtime_t timeout) 96 { 97 if (timeout == 0 || timeout == B_INFINITE_TIMEOUT) 98 return timeout; 99 100 return timeout + system_time(); 101 } 102 103 104 static inline status_t 105 posix_error(status_t error) 106 { 107 if (error == B_TIMED_OUT) 108 return B_WOULD_BLOCK; 109 110 return error; 111 } 112 113 114 static inline bool 115 in_window(const tcp_sequence &sequence, const tcp_sequence &rcvNext, 116 uint32 rcvWindow) 117 { 118 return sequence >= rcvNext && sequence < (rcvNext + rcvWindow); 119 } 120 121 122 static inline bool 123 segment_in_sequence(const tcp_segment_header &segment, int size, 124 const tcp_sequence &rcvNext, uint32 rcvWindow) 125 { 126 tcp_sequence sequence(segment.sequence); 127 if (size == 0) { 128 if (rcvWindow == 0) 129 return sequence == rcvNext; 130 return in_window(sequence, rcvNext, rcvWindow); 131 } else { 132 if (rcvWindow == 0) 133 return false; 134 return in_window(sequence, rcvNext, rcvWindow) 135 || in_window(sequence + size - 1, rcvNext, rcvWindow); 136 } 137 } 138 139 140 static inline bool 141 is_writable(tcp_state state) 142 { 143 return state == SYNCHRONIZE_SENT || state == SYNCHRONIZE_RECEIVED 144 || state == ESTABLISHED || state == FINISH_RECEIVED; 145 } 146 147 148 static inline uint32 tcp_now() 149 { 150 return system_time() / kTimestampFactor; 151 } 152 153 154 static inline uint32 tcp_diff_timestamp(uint32 base) 155 { 156 uint32 now = tcp_now(); 157 158 if (now > base) 159 return now - base; 160 161 return now + UINT_MAX - base; 162 } 163 164 165 static inline bool 166 state_needs_finish(int32 state) 167 { 168 return state == WAIT_FOR_FINISH_ACKNOWLEDGE 169 || state == FINISH_SENT || state == CLOSING; 170 } 171 172 173 WaitList::WaitList(const char *name) 174 { 175 fCondition = 0; 176 fSem = create_sem(0, name); 177 } 178 179 180 WaitList::~WaitList() 181 { 182 delete_sem(fSem); 183 } 184 185 186 status_t 187 WaitList::InitCheck() const 188 { 189 return fSem; 190 } 191 192 193 status_t 194 WaitList::Wait(MutexLocker &locker, bigtime_t timeout, bool wakeNext) 195 { 196 locker.Unlock(); 197 198 status_t status = B_OK; 199 200 while (status == B_OK && !atomic_test_and_set(&fCondition, 0, 1)) { 201 status = acquire_sem_etc(fSem, 1, B_ABSOLUTE_TIMEOUT | B_CAN_INTERRUPT, 202 timeout); 203 } 204 205 locker.Lock(); 206 if (status == B_OK && wakeNext) 207 Signal(); 208 209 return status; 210 } 211 212 213 void 214 WaitList::Signal() 215 { 216 atomic_or(&fCondition, 1); 217 #ifdef __HAIKU__ 218 release_sem_etc(fSem, 1, B_DO_NOT_RESCHEDULE | B_RELEASE_IF_WAITING_ONLY); 219 #else 220 int32 count; 221 if (get_sem_count(fSem, &count) == B_OK && count < 0) 222 release_sem_etc(fSem, 1, B_DO_NOT_RESCHEDULE); 223 #endif 224 } 225 226 227 TCPEndpoint::TCPEndpoint(net_socket *socket) 228 : 229 ProtocolSocket(socket), 230 fManager(NULL), 231 fReceiveList("tcp receive"), 232 fSendList("tcp send"), 233 fOptions(0), 234 fSendWindowShift(0), 235 fReceiveWindowShift(0), 236 fSendUnacknowledged(0), 237 fSendNext(0), 238 fSendMax(0), 239 fSendWindow(0), 240 fSendMaxWindow(0), 241 fSendMaxSegmentSize(TCP_DEFAULT_MAX_SEGMENT_SIZE), 242 fSendQueue(socket->send.buffer_size), 243 fInitialSendSequence(0), 244 fDuplicateAcknowledgeCount(0), 245 fRoute(NULL), 246 fReceiveNext(0), 247 fReceiveMaxAdvertised(0), 248 fReceiveWindow(socket->receive.buffer_size), 249 fReceiveMaxSegmentSize(TCP_DEFAULT_MAX_SEGMENT_SIZE), 250 fReceiveQueue(socket->receive.buffer_size), 251 fRoundTripTime(TCP_INITIAL_RTT / kTimestampFactor), 252 fRoundTripDeviation(TCP_INITIAL_RTT / kTimestampFactor), 253 fRetransmitTimeout(TCP_INITIAL_RTT), 254 fReceivedTimestamp(0), 255 fCongestionWindow(0), 256 fSlowStartThreshold(0), 257 fState(CLOSED), 258 fFlags(FLAG_OPTION_WINDOW_SCALE | FLAG_OPTION_TIMESTAMP), 259 fError(B_OK) 260 { 261 //gStackModule->init_timer(&fTimer, _TimeWait, this); 262 263 // TODO: to be replaced with a real locking strategy! 264 mutex_init(&fLock, "tcp lock"); 265 266 gStackModule->init_timer(&fPersistTimer, TCPEndpoint::_PersistTimer, this); 267 gStackModule->init_timer(&fRetransmitTimer, TCPEndpoint::_RetransmitTimer, this); 268 gStackModule->init_timer(&fDelayedAcknowledgeTimer, 269 TCPEndpoint::_DelayedAcknowledgeTimer, this); 270 gStackModule->init_timer(&fTimeWaitTimer, TCPEndpoint::_TimeWaitTimer, this); 271 } 272 273 274 TCPEndpoint::~TCPEndpoint() 275 { 276 mutex_lock(&fLock); 277 278 gStackModule->cancel_timer(&fRetransmitTimer); 279 gStackModule->cancel_timer(&fPersistTimer); 280 gStackModule->cancel_timer(&fDelayedAcknowledgeTimer); 281 gStackModule->cancel_timer(&fTimeWaitTimer); 282 283 if (fManager) { 284 fManager->Unbind(this); 285 return_endpoint_manager(fManager); 286 } 287 288 mutex_destroy(&fLock); 289 } 290 291 292 status_t 293 TCPEndpoint::InitCheck() const 294 { 295 if (fLock.sem < B_OK) 296 return fLock.sem; 297 298 if (fReceiveList.InitCheck() < B_OK) 299 return fReceiveList.InitCheck(); 300 301 if (fSendList.InitCheck() < B_OK) 302 return fSendList.InitCheck(); 303 304 return B_OK; 305 } 306 307 308 // #pragma mark - protocol API 309 310 311 status_t 312 TCPEndpoint::Open() 313 { 314 TRACE("Open()"); 315 316 status_t status = ProtocolSocket::Open(); 317 if (status < B_OK) 318 return status; 319 320 fManager = create_endpoint_manager(Domain()); 321 if (fManager == NULL) 322 return EAFNOSUPPORT; 323 324 return B_OK; 325 } 326 327 328 status_t 329 TCPEndpoint::Close() 330 { 331 TRACE("Close()"); 332 333 MutexLocker lock(fLock); 334 335 if (fState == LISTEN) 336 delete_sem(fAcceptSemaphore); 337 338 if (fState == SYNCHRONIZE_SENT || fState == LISTEN) { 339 fState = CLOSED; 340 return B_OK; 341 } 342 343 status_t status = _ShutdownEgress(true); 344 if (status != B_OK) 345 return status; 346 347 if (socket->options & SO_LINGER) { 348 TRACE("Close(): Lingering for %i secs", socket->linger); 349 350 bigtime_t maximum = absolute_timeout(socket->linger * 1000000LL); 351 352 while (fSendQueue.Used() > 0) { 353 status = fSendList.Wait(lock, maximum); 354 if (status == B_TIMED_OUT || status == B_WOULD_BLOCK) 355 break; 356 else if (status < B_OK) 357 return status; 358 } 359 360 TRACE("Close(): after waiting, the SendQ was left with %lu bytes.", 361 fSendQueue.Used()); 362 } 363 364 // TODO: do i need to wait until fState returns to CLOSED? 365 return B_OK; 366 } 367 368 369 status_t 370 TCPEndpoint::Free() 371 { 372 TRACE("Free()"); 373 374 MutexLocker _(fLock); 375 376 if (fState <= SYNCHRONIZE_SENT || fState == TIME_WAIT) 377 return B_OK; 378 379 // we are only interested in the timer, not in changing state 380 _EnterTimeWait(); 381 return B_BUSY; 382 // we'll be freed later when the 2MSL timer expires 383 } 384 385 386 /*! 387 Creates and sends a synchronize packet to /a address, and then waits 388 until the connection has been established or refused. 389 */ 390 status_t 391 TCPEndpoint::Connect(const sockaddr *address) 392 { 393 TRACE("Connect() on address %s", PrintAddress(address)); 394 395 MutexLocker locker(fLock); 396 397 // Can only call connect() from CLOSED or LISTEN states 398 // otherwise endpoint is considered already connected 399 if (fState == LISTEN) { 400 // this socket is about to connect; remove pending connections in the backlog 401 gSocketModule->set_max_backlog(socket, 0); 402 } else if (fState != CLOSED) 403 return EISCONN; 404 405 status_t status = _PrepareSendPath(address); 406 if (status < B_OK) 407 return status; 408 409 TRACE(" Connect(): starting 3-way handshake..."); 410 411 fState = SYNCHRONIZE_SENT; 412 413 // send SYN 414 status = _SendQueued(); 415 if (status != B_OK) { 416 fState = CLOSED; 417 return status; 418 } 419 420 // If we are running over Loopback, after _SendQueued() returns we 421 // may be in ESTABLISHED already. 422 if (fState == ESTABLISHED) { 423 TRACE(" Connect() completed after _SendQueued()"); 424 return B_OK; 425 } 426 427 // wait until 3-way handshake is complete (if needed) 428 bigtime_t timeout = min_c(socket->send.timeout, TCP_CONNECTION_TIMEOUT); 429 if (timeout == 0) { 430 // we're a non-blocking socket 431 TRACE(" Connect() delayed, return EINPROGRESS"); 432 return EINPROGRESS; 433 } 434 435 status = _WaitForEstablished(locker, absolute_timeout(timeout)); 436 TRACE(" Connect(): Connection complete: %s (timeout was %llu)", 437 strerror(status), timeout); 438 return posix_error(status); 439 } 440 441 442 status_t 443 TCPEndpoint::Accept(struct net_socket **_acceptedSocket) 444 { 445 TRACE("Accept()"); 446 447 MutexLocker locker(fLock); 448 449 status_t status; 450 bigtime_t timeout = absolute_timeout(socket->receive.timeout); 451 452 do { 453 locker.Unlock(); 454 455 status = acquire_sem_etc(fAcceptSemaphore, 1, B_ABSOLUTE_TIMEOUT 456 | B_CAN_INTERRUPT, timeout); 457 if (status < B_OK) 458 return status; 459 460 locker.Lock(); 461 status = gSocketModule->dequeue_connected(socket, _acceptedSocket); 462 if (status == B_OK) 463 TRACE(" Accept() returning %p", (*_acceptedSocket)->first_protocol); 464 } while (status < B_OK); 465 466 return status; 467 } 468 469 470 status_t 471 TCPEndpoint::Bind(const sockaddr *address) 472 { 473 if (address == NULL) 474 return B_BAD_VALUE; 475 476 MutexLocker lock(fLock); 477 478 TRACE("Bind() on address %s", PrintAddress(address)); 479 480 if (fState != CLOSED) 481 return EISCONN; 482 483 return fManager->Bind(this, address); 484 } 485 486 487 status_t 488 TCPEndpoint::Unbind(struct sockaddr *address) 489 { 490 TRACE("Unbind()"); 491 492 MutexLocker lock(fLock); 493 return fManager->Unbind(this); 494 } 495 496 497 status_t 498 TCPEndpoint::Listen(int count) 499 { 500 TRACE("Listen()"); 501 502 MutexLocker lock(fLock); 503 504 if (fState != CLOSED) 505 return B_BAD_VALUE; 506 507 fAcceptSemaphore = create_sem(0, "tcp accept"); 508 if (fAcceptSemaphore < B_OK) 509 return ENOBUFS; 510 511 status_t status = fManager->SetPassive(this); 512 if (status < B_OK) { 513 delete_sem(fAcceptSemaphore); 514 fAcceptSemaphore = -1; 515 return status; 516 } 517 518 fState = LISTEN; 519 return B_OK; 520 } 521 522 523 status_t 524 TCPEndpoint::Shutdown(int direction) 525 { 526 TRACE("Shutdown(%i)", direction); 527 528 MutexLocker lock(fLock); 529 530 if (direction == SHUT_RD || direction == SHUT_RDWR) 531 fFlags |= FLAG_NO_RECEIVE; 532 533 if (direction == SHUT_WR || direction == SHUT_RDWR) 534 _ShutdownEgress(false); 535 536 return B_OK; 537 } 538 539 540 /*! 541 Puts data contained in \a buffer into send buffer 542 */ 543 status_t 544 TCPEndpoint::SendData(net_buffer *buffer) 545 { 546 MutexLocker lock(fLock); 547 548 TRACE("SendData(buffer %p, size %lu, flags %lx) [total %lu bytes, has %lu]", 549 buffer, buffer->size, buffer->flags, fSendQueue.Size(), 550 fSendQueue.Free()); 551 552 if (fState == CLOSED) 553 return ENOTCONN; 554 else if (fState == LISTEN) { 555 return EDESTADDRREQ; 556 } else if (fState == FINISH_SENT || fState == FINISH_ACKNOWLEDGED 557 || fState == CLOSING || fState == WAIT_FOR_FINISH_ACKNOWLEDGE 558 || fState == TIME_WAIT) { 559 // TODO: send SIGPIPE signal to app? 560 return EPIPE; 561 } 562 563 if (buffer->size > 0) { 564 if (buffer->size > fSendQueue.Size()) 565 return EMSGSIZE; 566 567 bigtime_t timeout = absolute_timeout(socket->send.timeout); 568 569 while (fSendQueue.Free() < buffer->size) { 570 status_t status = fSendList.Wait(lock, timeout); 571 if (status < B_OK) { 572 TRACE(" SendData() returning %s (%d)", 573 strerror(posix_error(status)), (int)posix_error(status)); 574 return posix_error(status); 575 } 576 } 577 578 fSendQueue.Add(buffer); 579 } 580 581 TRACE(" SendData(): %lu bytes used.", fSendQueue.Used()); 582 583 if (fState == ESTABLISHED || fState == FINISH_RECEIVED) 584 _SendQueued(); 585 586 return B_OK; 587 } 588 589 590 ssize_t 591 TCPEndpoint::SendAvailable() 592 { 593 MutexLocker locker(fLock); 594 595 ssize_t available; 596 597 if (is_writable(fState)) 598 available = fSendQueue.Free(); 599 else 600 available = EPIPE; 601 602 TRACE("SendAvailable(): %li", available); 603 return available; 604 } 605 606 607 status_t 608 TCPEndpoint::FillStat(net_stat *stat) 609 { 610 MutexLocker _(fLock); 611 612 strlcpy(stat->state, name_for_state(fState), sizeof(stat->state)); 613 stat->receive_queue_size = fReceiveQueue.Available(); 614 stat->send_queue_size = fSendQueue.Used(); 615 616 return B_OK; 617 } 618 619 620 status_t 621 TCPEndpoint::ReadData(size_t numBytes, uint32 flags, net_buffer** _buffer) 622 { 623 TRACE("ReadData(%lu bytes, flags 0x%x)", numBytes, (unsigned int)flags); 624 625 MutexLocker locker(fLock); 626 627 *_buffer = NULL; 628 629 if (fState == CLOSED) 630 return ENOTCONN; 631 632 bigtime_t timeout = absolute_timeout(socket->receive.timeout); 633 634 if (fState == SYNCHRONIZE_SENT || fState == SYNCHRONIZE_RECEIVED) { 635 if (flags & MSG_DONTWAIT) 636 return B_WOULD_BLOCK; 637 638 status_t status = _WaitForEstablished(locker, timeout); 639 if (status < B_OK) 640 return posix_error(status); 641 } 642 643 size_t dataNeeded = socket->receive.low_water_mark; 644 645 // When MSG_WAITALL is set then the function should block 646 // until the full amount of data can be returned. 647 if (flags & MSG_WAITALL) 648 dataNeeded = numBytes; 649 650 // TODO: add support for urgent data (MSG_OOB) 651 652 while (true) { 653 if (fState == CLOSING || fState == WAIT_FOR_FINISH_ACKNOWLEDGE 654 || fState == TIME_WAIT) { 655 // ``Connection closing''. 656 return B_OK; 657 } 658 659 if (fReceiveQueue.Available() > 0) { 660 if (fReceiveQueue.Available() >= dataNeeded || 661 ((fReceiveQueue.PushedData() > 0) 662 && (fReceiveQueue.PushedData() >= fReceiveQueue.Available()))) 663 break; 664 } else if (fState == FINISH_RECEIVED) { 665 // ``If no text is awaiting delivery, the RECEIVE will 666 // get a Connection closing''. 667 return B_OK; 668 } 669 670 if ((flags & MSG_DONTWAIT) || (socket->receive.timeout == 0)) 671 return B_WOULD_BLOCK; 672 673 status_t status = fReceiveList.Wait(locker, timeout, false); 674 if (status < B_OK) { 675 // The Open Group base specification mentions that EINTR should be 676 // returned if the recv() is interrupted before _any data_ is 677 // available. So we actually check if there is data, and if so, 678 // push it to the user. 679 if ((status == B_TIMED_OUT || status == B_INTERRUPTED) 680 && fReceiveQueue.Available() > 0) 681 break; 682 683 return posix_error(status); 684 } 685 } 686 687 TRACE(" ReadData(): %lu are available.", fReceiveQueue.Available()); 688 689 if (numBytes < fReceiveQueue.Available()) 690 fReceiveList.Signal(); 691 692 bool clone = (flags & MSG_PEEK); 693 694 ssize_t receivedBytes = fReceiveQueue.Get(numBytes, !clone, _buffer); 695 696 TRACE(" ReadData(): %lu bytes kept.", fReceiveQueue.Available()); 697 698 // if we are opening the window, check if we should send an ACK 699 if (!clone) 700 SendAcknowledge(false); 701 702 return receivedBytes; 703 } 704 705 706 ssize_t 707 TCPEndpoint::ReadAvailable() 708 { 709 MutexLocker locker(fLock); 710 711 TRACE("ReadAvailable(): %li", _AvailableData()); 712 713 return _AvailableData(); 714 } 715 716 717 status_t 718 TCPEndpoint::SetSendBufferSize(size_t length) 719 { 720 MutexLocker _(fLock); 721 fSendQueue.SetMaxBytes(length); 722 return B_OK; 723 } 724 725 726 status_t 727 TCPEndpoint::SetReceiveBufferSize(size_t length) 728 { 729 MutexLocker _(fLock); 730 fReceiveQueue.SetMaxBytes(length); 731 return B_OK; 732 } 733 734 735 status_t 736 TCPEndpoint::SetOption(int option, const void *_value, int length) 737 { 738 if (option != TCP_NODELAY) 739 return B_BAD_VALUE; 740 741 if (length != sizeof(int)) 742 return B_BAD_VALUE; 743 744 const int *value = (const int *)_value; 745 746 MutexLocker _(fLock); 747 if (*value) 748 fOptions |= TCP_NODELAY; 749 else 750 fOptions &= ~TCP_NODELAY; 751 752 return B_OK; 753 } 754 755 756 // #pragma mark - misc 757 758 759 bool 760 TCPEndpoint::IsBound() const 761 { 762 return !LocalAddress().IsEmpty(true); 763 } 764 765 766 void 767 TCPEndpoint::DeleteSocket() 768 { 769 // the next call will delete `this'. 770 gSocketModule->delete_socket(socket); 771 } 772 773 774 status_t 775 TCPEndpoint::DelayedAcknowledge() 776 { 777 if (gStackModule->cancel_timer(&fDelayedAcknowledgeTimer)) { 778 // timer was active, send an ACK now (with the exception above, 779 // we send every other ACK) 780 return SendAcknowledge(true); 781 } 782 783 gStackModule->set_timer(&fDelayedAcknowledgeTimer, TCP_DELAYED_ACKNOWLEDGE_TIMEOUT); 784 return B_OK; 785 } 786 787 788 status_t 789 TCPEndpoint::SendAcknowledge(bool force) 790 { 791 return _SendQueued(force, 0); 792 } 793 794 795 void 796 TCPEndpoint::_StartPersistTimer() 797 { 798 gStackModule->set_timer(&fPersistTimer, 1000000LL); 799 } 800 801 802 void 803 TCPEndpoint::_EnterTimeWait() 804 { 805 gStackModule->set_timer(&fTimeWaitTimer, TCP_MAX_SEGMENT_LIFETIME << 1); 806 } 807 808 809 status_t 810 TCPEndpoint::UpdateTimeWait() 811 { 812 return B_OK; 813 } 814 815 816 // #pragma mark - receive 817 818 819 int32 820 TCPEndpoint::_ListenReceive(tcp_segment_header &segment, net_buffer *buffer) 821 { 822 TRACE("ListenReceive()"); 823 824 // Essentially, we accept only TCP_FLAG_SYNCHRONIZE in this state, 825 // but the error behaviour differs 826 if (segment.flags & TCP_FLAG_RESET) 827 return DROP; 828 if (segment.flags & TCP_FLAG_ACKNOWLEDGE) 829 return DROP | RESET; 830 if ((segment.flags & TCP_FLAG_SYNCHRONIZE) == 0) 831 return DROP; 832 833 // TODO: drop broadcast/multicast 834 835 // spawn new endpoint for accept() 836 net_socket *newSocket; 837 if (gSocketModule->spawn_pending_socket(socket, &newSocket) < B_OK) 838 return DROP; 839 840 return ((TCPEndpoint *)newSocket->first_protocol)->Spawn(this, 841 segment, buffer); 842 } 843 844 845 int32 846 TCPEndpoint::Spawn(TCPEndpoint *parent, tcp_segment_header &segment, 847 net_buffer *buffer) 848 { 849 MutexLocker _(fLock); 850 851 // TODO error checking 852 ProtocolSocket::Open(); 853 854 fState = SYNCHRONIZE_RECEIVED; 855 fManager = parent->fManager; 856 857 LocalAddress().SetTo(buffer->destination); 858 PeerAddress().SetTo(buffer->source); 859 860 TRACE("Spawn()"); 861 862 // TODO: proper error handling! 863 if (fManager->BindChild(this) < B_OK) 864 return DROP; 865 866 if (_PrepareSendPath(*PeerAddress()) < B_OK) 867 return DROP; 868 869 fOptions = parent->fOptions; 870 fAcceptSemaphore = parent->fAcceptSemaphore; 871 872 _PrepareReceivePath(segment); 873 874 // send SYN+ACK 875 if (_SendQueued() < B_OK) 876 return DROP; 877 878 segment.flags &= ~TCP_FLAG_SYNCHRONIZE; 879 // we handled this flag now, it must not be set for further processing 880 881 return _Receive(segment, buffer); 882 } 883 884 885 void 886 TCPEndpoint::DumpInternalState() const 887 { 888 kprintf("Lock: { sem: %ld, holder: %ld }\n", fLock.sem, fLock.holder); 889 kprintf("AcceptSem: %ld\n", fAcceptSemaphore); 890 kprintf("Options: 0x%lx\n", (uint32)fOptions); 891 kprintf("SendWindowShift: %lu\n", (uint32)fSendWindowShift); 892 kprintf("ReceiveWindowShift: %lu\n", (uint32)fReceiveWindowShift); 893 kprintf("SendUnacknowledged: %lu\n", (uint32)fSendUnacknowledged); 894 kprintf("SendNext: %lu\n", (uint32)fSendNext); 895 kprintf("SendMax: %lu\n", (uint32)fSendMax); 896 kprintf("SendWindow: %lu\n", fSendWindow); 897 kprintf("SendMaxWindow: %lu\n", fSendMaxWindow); 898 kprintf("SendMaxSegmentSize: %lu\n", fSendMaxSegmentSize); 899 kprintf("Send-Q: %lu / %lu\n", fSendQueue.Used(), fSendQueue.Size()); 900 kprintf("LastAcknowledgeSent: %lu\n", (uint32)fLastAcknowledgeSent); 901 kprintf("InitialSendSequence: %lu\n", (uint32)fInitialSendSequence); 902 kprintf("DuplicateAcknowledgeCount: %lu\n", fDuplicateAcknowledgeCount); 903 kprintf("ReceiveNext: %lu\n", (uint32)fReceiveNext); 904 kprintf("ReceiveMaxAdvertised: %lu\n", (uint32)fReceiveMaxAdvertised); 905 kprintf("ReceiveWindow: %lu\n", (uint32)fReceiveWindow); 906 kprintf("ReceiveMaxSegmentSize: %lu\n", (uint32)fReceiveMaxSegmentSize); 907 kprintf("Recv-Q: %lu / %lu\n", fReceiveQueue.Available(), 908 fReceiveQueue.Size()); 909 kprintf("InitialReceiveSequence: %lu\n", (uint32)fInitialReceiveSequence); 910 kprintf("RoundTripTime: %ld (dev %ld)\n", fRoundTripTime, 911 fRoundTripDeviation); 912 kprintf("RetransmitTimeout: %llu\n", (uint64)fRetransmitTimeout); 913 kprintf("CongestionWindow: %lu\n", fCongestionWindow); 914 kprintf("SlowStartThreshold: %lu\n", fSlowStartThreshold); 915 kprintf("State: %s\n", name_for_state(fState)); 916 kprintf("Flags: 0x%lx\n", fFlags); 917 } 918 919 920 int32 921 TCPEndpoint::_SynchronizeSentReceive(tcp_segment_header &segment, net_buffer *buffer) 922 { 923 TRACE("SynchronizeSentReceive()"); 924 925 if ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0 926 && (fInitialSendSequence >= segment.acknowledge 927 || fSendMax < segment.acknowledge)) 928 return DROP | RESET; 929 930 if (segment.flags & TCP_FLAG_RESET) { 931 fError = ECONNREFUSED; 932 fState = CLOSED; 933 return DROP; 934 } 935 936 if ((segment.flags & TCP_FLAG_SYNCHRONIZE) == 0) 937 return DROP; 938 939 fSendUnacknowledged = segment.acknowledge; 940 _PrepareReceivePath(segment); 941 942 if (segment.flags & TCP_FLAG_ACKNOWLEDGE) { 943 _MarkEstablished(); 944 } else { 945 // simultaneous open 946 fState = SYNCHRONIZE_RECEIVED; 947 } 948 949 segment.flags &= ~TCP_FLAG_SYNCHRONIZE; 950 // we handled this flag now, it must not be set for further processing 951 952 return _Receive(segment, buffer) | IMMEDIATE_ACKNOWLEDGE; 953 } 954 955 956 int32 957 TCPEndpoint::SegmentReceived(tcp_segment_header &segment, net_buffer *buffer) 958 { 959 MutexLocker locker(fLock); 960 961 TRACE("SegmentReceived(): buffer %p (%lu bytes) address %s to %s", 962 buffer, buffer->size, PrintAddress(buffer->source), 963 PrintAddress(buffer->destination)); 964 TRACE(" flags 0x%x, seq %lu, ack %lu, wnd %lu", 965 segment.flags, segment.sequence, segment.acknowledge, 966 (uint32)segment.advertised_window << fSendWindowShift); 967 968 int32 segmentAction = DROP; 969 970 switch (fState) { 971 case LISTEN: 972 segmentAction = _ListenReceive(segment, buffer); 973 break; 974 975 case SYNCHRONIZE_SENT: 976 segmentAction = _SynchronizeSentReceive(segment, buffer); 977 break; 978 979 case SYNCHRONIZE_RECEIVED: 980 case ESTABLISHED: 981 case FINISH_RECEIVED: 982 case WAIT_FOR_FINISH_ACKNOWLEDGE: 983 case FINISH_SENT: 984 case FINISH_ACKNOWLEDGED: 985 case CLOSING: 986 case TIME_WAIT: 987 case CLOSED: 988 segmentAction = _SegmentReceived(segment, buffer); 989 break; 990 } 991 992 // process acknowledge action as asked for by the *Receive() method 993 if (segmentAction & IMMEDIATE_ACKNOWLEDGE) 994 SendAcknowledge(true); 995 else if (segmentAction & ACKNOWLEDGE) 996 DelayedAcknowledge(); 997 998 return segmentAction; 999 } 1000 1001 int32 1002 TCPEndpoint::_SegmentReceived(tcp_segment_header &segment, net_buffer *buffer) 1003 { 1004 uint32 advertisedWindow = (uint32)segment.advertised_window << fSendWindowShift; 1005 1006 // First, handle the most common case for uni-directional data transfer 1007 // (known as header prediction - the segment must not change the window, 1008 // and must be the expected sequence, and contain no control flags) 1009 1010 if (fState == ESTABLISHED 1011 && segment.AcknowledgeOnly() 1012 && fReceiveNext == segment.sequence 1013 && advertisedWindow > 0 && advertisedWindow == fSendWindow 1014 && fSendNext == fSendMax) { 1015 1016 _UpdateTimestamps(segment, buffer->size); 1017 1018 if (buffer->size == 0) { 1019 // this is a pure acknowledge segment - we're on the sending end 1020 if (fSendUnacknowledged < segment.acknowledge 1021 && fSendMax >= segment.acknowledge) { 1022 _Acknowledged(segment); 1023 return DROP; 1024 } 1025 } else if (segment.acknowledge == fSendUnacknowledged 1026 && fReceiveQueue.IsContiguous() 1027 && fReceiveQueue.Free() >= buffer->size 1028 && !(fFlags & FLAG_NO_RECEIVE)) { 1029 if (_AddData(segment, buffer)) 1030 _NotifyReader(); 1031 return KEEP | ((segment.flags & TCP_FLAG_PUSH) ? 1032 IMMEDIATE_ACKNOWLEDGE : ACKNOWLEDGE); 1033 } 1034 } 1035 1036 // The fast path was not applicable, so we continue with the standard 1037 // processing of the incoming segment 1038 1039 if (fState != SYNCHRONIZE_SENT && fState != LISTEN && fState != CLOSED) { 1040 // 1. check sequence number 1041 if (!segment_in_sequence(segment, buffer->size, fReceiveNext, 1042 fReceiveWindow)) { 1043 TRACE(" Receive(): segment out of window, next: %lu wnd: %lu", 1044 (uint32)fReceiveNext, fReceiveWindow); 1045 if (segment.flags & TCP_FLAG_RESET) 1046 return DROP; 1047 return DROP | IMMEDIATE_ACKNOWLEDGE; 1048 } 1049 } 1050 1051 return _Receive(segment, buffer); 1052 } 1053 1054 1055 // #pragma mark - send 1056 1057 1058 inline uint8 1059 TCPEndpoint::_CurrentFlags() 1060 { 1061 // we don't set FLAG_FINISH here, instead we do it 1062 // conditionally below depending if we are sending 1063 // the last bytes of the send queue. 1064 1065 switch (fState) { 1066 case CLOSED: 1067 return TCP_FLAG_RESET | TCP_FLAG_ACKNOWLEDGE; 1068 1069 case SYNCHRONIZE_SENT: 1070 return TCP_FLAG_SYNCHRONIZE; 1071 case SYNCHRONIZE_RECEIVED: 1072 return TCP_FLAG_SYNCHRONIZE | TCP_FLAG_ACKNOWLEDGE; 1073 1074 case ESTABLISHED: 1075 case FINISH_RECEIVED: 1076 case FINISH_ACKNOWLEDGED: 1077 case TIME_WAIT: 1078 case WAIT_FOR_FINISH_ACKNOWLEDGE: 1079 case FINISH_SENT: 1080 case CLOSING: 1081 return TCP_FLAG_ACKNOWLEDGE; 1082 1083 default: 1084 return B_ERROR; 1085 } 1086 } 1087 1088 1089 inline bool 1090 TCPEndpoint::_ShouldSendSegment(tcp_segment_header &segment, uint32 length, 1091 uint32 segmentMaxSize, uint32 flightSize) 1092 { 1093 if (length > 0) { 1094 // Avoid the silly window syndrome - we only send a segment in case: 1095 // - we have a full segment to send, or 1096 // - we're at the end of our buffer queue, or 1097 // - the buffer is at least larger than half of the maximum send window, or 1098 // - we're retransmitting data 1099 if (length == segmentMaxSize 1100 || (fOptions & TCP_NODELAY) != 0 1101 || tcp_sequence(fSendNext + length) == fSendQueue.LastSequence() 1102 || (fSendMaxWindow > 0 && length >= fSendMaxWindow / 2)) 1103 return true; 1104 } 1105 1106 // check if we need to send a window update to the peer 1107 if (segment.advertised_window > 0) { 1108 // correct the window to take into account what already has been advertised 1109 uint32 window = (segment.advertised_window << fReceiveWindowShift) 1110 - (fReceiveMaxAdvertised - fReceiveNext); 1111 1112 // if we can advertise a window larger than twice the maximum segment 1113 // size, or half the maximum buffer size we send a window update 1114 if (window >= (fReceiveMaxSegmentSize << 1) 1115 || window >= (socket->receive.buffer_size >> 1)) 1116 return true; 1117 } 1118 1119 if ((segment.flags & (TCP_FLAG_SYNCHRONIZE | TCP_FLAG_FINISH | TCP_FLAG_RESET)) != 0) 1120 return true; 1121 1122 // there is no reason to send a segment just now 1123 return false; 1124 } 1125 1126 1127 status_t 1128 TCPEndpoint::_SendQueued(bool force) 1129 { 1130 return _SendQueued(force, fSendWindow); 1131 } 1132 1133 1134 /*! 1135 Sends one or more TCP segments with the data waiting in the queue, or some 1136 specific flags that need to be sent. 1137 */ 1138 status_t 1139 TCPEndpoint::_SendQueued(bool force, uint32 sendWindow) 1140 { 1141 if (fRoute == NULL) 1142 return B_ERROR; 1143 1144 // in passive state? 1145 if (fState == LISTEN) 1146 return B_ERROR; 1147 1148 tcp_segment_header segment(_CurrentFlags()); 1149 1150 if ((fOptions & TCP_NOOPT) == 0) { 1151 if (fFlags & FLAG_OPTION_TIMESTAMP) { 1152 segment.options |= TCP_HAS_TIMESTAMPS; 1153 segment.timestamp_reply = fReceivedTimestamp; 1154 segment.timestamp_value = tcp_now(); 1155 } 1156 1157 if ((segment.flags & TCP_FLAG_SYNCHRONIZE) 1158 && (fSendNext == fInitialSendSequence)) { 1159 // add connection establishment options 1160 segment.max_segment_size = fReceiveMaxSegmentSize; 1161 if (fFlags & FLAG_OPTION_WINDOW_SCALE) { 1162 segment.options |= TCP_HAS_WINDOW_SCALE; 1163 segment.window_shift = fReceiveWindowShift; 1164 } 1165 } 1166 } 1167 1168 size_t availableBytes = fReceiveQueue.Free(); 1169 if (fFlags & FLAG_OPTION_WINDOW_SCALE) 1170 segment.advertised_window = availableBytes >> fReceiveWindowShift; 1171 else 1172 segment.advertised_window = min_c(TCP_MAX_WINDOW, availableBytes); 1173 1174 segment.acknowledge = fReceiveNext; 1175 segment.urgent_offset = 0; 1176 1177 if (fCongestionWindow > 0 && fCongestionWindow < sendWindow) 1178 sendWindow = fCongestionWindow; 1179 1180 // SND.UNA SND.NXT SND.MAX 1181 // | | | 1182 // v v v 1183 // ----------------------------------- 1184 // | effective window | 1185 // ----------------------------------- 1186 1187 // Flight size represents the window of data which is currently in the 1188 // ether. We should never send data such as the flight size becomes larger 1189 // than the effective window. Note however that the effective window may be 1190 // reduced (by congestion for instance), so at some point in time flight 1191 // size may be larger than the currently calculated window. 1192 1193 uint32 flightSize = fSendMax - fSendUnacknowledged; 1194 uint32 consumedWindow = fSendNext - fSendUnacknowledged; 1195 1196 if (consumedWindow > sendWindow) { 1197 sendWindow = 0; 1198 // TODO enter persist state? try to get a window update. 1199 } else 1200 sendWindow -= consumedWindow; 1201 1202 if (force && sendWindow == 0 && fSendNext <= fSendQueue.LastSequence()) { 1203 // send one byte of data to ask for a window update 1204 // (triggered by the persist timer) 1205 sendWindow = 1; 1206 } 1207 1208 uint32 length = min_c(fSendQueue.Available(fSendNext), sendWindow); 1209 tcp_sequence previousSendNext = fSendNext; 1210 1211 do { 1212 uint32 segmentMaxSize = fSendMaxSegmentSize 1213 - tcp_options_length(segment); 1214 uint32 segmentLength = min_c(length, segmentMaxSize); 1215 1216 if (fSendNext + segmentLength == fSendQueue.LastSequence()) { 1217 if (state_needs_finish(fState)) 1218 segment.flags |= TCP_FLAG_FINISH; 1219 if (length > 0) 1220 segment.flags |= TCP_FLAG_PUSH; 1221 } 1222 1223 // Determine if we should really send this segment 1224 if (!force && !_ShouldSendSegment(segment, segmentLength, 1225 segmentMaxSize, flightSize)) { 1226 if (fSendQueue.Available() 1227 && !gStackModule->is_timer_active(&fPersistTimer) 1228 && !gStackModule->is_timer_active(&fRetransmitTimer)) 1229 _StartPersistTimer(); 1230 break; 1231 } 1232 1233 net_buffer *buffer = gBufferModule->create(256); 1234 if (buffer == NULL) 1235 return B_NO_MEMORY; 1236 1237 status_t status = B_OK; 1238 if (segmentLength > 0) 1239 status = fSendQueue.Get(buffer, fSendNext, segmentLength); 1240 if (status < B_OK) { 1241 gBufferModule->free(buffer); 1242 return status; 1243 } 1244 1245 LocalAddress().CopyTo(buffer->source); 1246 PeerAddress().CopyTo(buffer->destination); 1247 1248 uint32 size = buffer->size; 1249 segment.sequence = fSendNext; 1250 1251 TRACE("SendQueued(): buffer %p (%lu bytes) address %s to %s", 1252 buffer, buffer->size, PrintAddress(buffer->source), 1253 PrintAddress(buffer->destination)); 1254 TRACE(" flags 0x%x, seq %lu, ack %lu, rwnd %hu, cwnd %lu" 1255 ", ssthresh %lu", segment.flags, segment.sequence, 1256 segment.acknowledge, segment.advertised_window, 1257 fCongestionWindow, fSlowStartThreshold); 1258 TRACE(" len %lu first %lu last %lu", segmentLength, 1259 (uint32)fSendQueue.FirstSequence(), 1260 (uint32)fSendQueue.LastSequence()); 1261 1262 PROBE(buffer, sendWindow); 1263 sendWindow -= buffer->size; 1264 1265 status = add_tcp_header(AddressModule(), segment, buffer); 1266 if (status != B_OK) { 1267 gBufferModule->free(buffer); 1268 return status; 1269 } 1270 1271 // Update send status - we need to do this before we send the data 1272 // for local connections as the answer is directly handled 1273 1274 if (segment.flags & TCP_FLAG_SYNCHRONIZE) { 1275 segment.options &= ~TCP_HAS_WINDOW_SCALE; 1276 segment.max_segment_size = 0; 1277 size++; 1278 } 1279 1280 if (segment.flags & TCP_FLAG_FINISH) 1281 size++; 1282 1283 uint32 sendMax = fSendMax; 1284 fSendNext += size; 1285 if (fSendMax < fSendNext) 1286 fSendMax = fSendNext; 1287 1288 fReceiveMaxAdvertised = fReceiveNext 1289 + ((uint32)segment.advertised_window << fReceiveWindowShift); 1290 1291 status = next->module->send_routed_data(next, fRoute, buffer); 1292 if (status < B_OK) { 1293 gBufferModule->free(buffer); 1294 1295 fSendNext = segment.sequence; 1296 fSendMax = sendMax; 1297 // restore send status 1298 return status; 1299 } 1300 1301 if (segment.flags & TCP_FLAG_ACKNOWLEDGE) 1302 fLastAcknowledgeSent = segment.acknowledge; 1303 1304 length -= segmentLength; 1305 segment.flags &= ~(TCP_FLAG_SYNCHRONIZE | TCP_FLAG_RESET | TCP_FLAG_FINISH); 1306 } while (length > 0); 1307 1308 // if we sent data from the beggining of the send queue, 1309 // start the retransmition timer 1310 if (previousSendNext == fSendUnacknowledged 1311 && fSendNext > previousSendNext) { 1312 TRACE(" SendQueue(): set retransmit timer with rto %llu", 1313 fRetransmitTimeout); 1314 1315 gStackModule->set_timer(&fRetransmitTimer, fRetransmitTimeout); 1316 } 1317 1318 return B_OK; 1319 } 1320 1321 1322 int 1323 TCPEndpoint::_MaxSegmentSize(const sockaddr *address) const 1324 { 1325 return next->module->get_mtu(next, address) - sizeof(tcp_header); 1326 } 1327 1328 1329 status_t 1330 TCPEndpoint::_ShutdownEgress(bool closing) 1331 { 1332 tcp_state previousState = fState; 1333 1334 if (fState == SYNCHRONIZE_RECEIVED || fState == ESTABLISHED) 1335 fState = FINISH_SENT; 1336 else if (fState == FINISH_RECEIVED) 1337 fState = WAIT_FOR_FINISH_ACKNOWLEDGE; 1338 else 1339 return B_OK; 1340 1341 status_t status = _SendQueued(); 1342 if (status != B_OK) { 1343 fState = previousState; 1344 return status; 1345 } 1346 1347 return B_OK; 1348 } 1349 1350 1351 ssize_t 1352 TCPEndpoint::_AvailableData() const 1353 { 1354 // TODO: Refer to the FLAG_NO_RECEIVE comment above regarding 1355 // the application of FLAG_NO_RECEIVE in listen()ing 1356 // sockets. 1357 if (fState == LISTEN) 1358 return gSocketModule->count_connected(socket); 1359 else if (fState == SYNCHRONIZE_SENT) 1360 return 0; 1361 1362 ssize_t availableData = fReceiveQueue.Available(); 1363 1364 if (availableData == 0 && !_ShouldReceive()) 1365 return ENOTCONN; 1366 1367 return availableData; 1368 } 1369 1370 1371 void 1372 TCPEndpoint::_NotifyReader() 1373 { 1374 fReceiveList.Signal(); 1375 gSocketModule->notify(socket, B_SELECT_READ, _AvailableData()); 1376 } 1377 1378 1379 bool 1380 TCPEndpoint::_ShouldReceive() const 1381 { 1382 if (fFlags & FLAG_NO_RECEIVE) 1383 return false; 1384 1385 return fState == ESTABLISHED || fState == FINISH_SENT 1386 || fState == FINISH_ACKNOWLEDGED; 1387 } 1388 1389 1390 int32 1391 TCPEndpoint::_Receive(tcp_segment_header &segment, net_buffer *buffer) 1392 { 1393 uint32 advertisedWindow = (uint32)segment.advertised_window << fSendWindowShift; 1394 1395 size_t segmentLength = buffer->size; 1396 1397 if (segment.flags & TCP_FLAG_RESET) { 1398 // is this a valid reset? 1399 if (fLastAcknowledgeSent <= segment.sequence 1400 && tcp_sequence(segment.sequence) 1401 < (fLastAcknowledgeSent + fReceiveWindow)) { 1402 if (fState == SYNCHRONIZE_RECEIVED) 1403 fError = ECONNREFUSED; 1404 else if (fState == CLOSING || fState == TIME_WAIT 1405 || fState == WAIT_FOR_FINISH_ACKNOWLEDGE) 1406 fError = ENOTCONN; 1407 else 1408 fError = ECONNRESET; 1409 1410 _NotifyReader(); 1411 fState = CLOSED; 1412 } 1413 1414 return DROP; 1415 } 1416 1417 if ((segment.flags & TCP_FLAG_SYNCHRONIZE) != 0 1418 || (fState == SYNCHRONIZE_RECEIVED 1419 && (fInitialReceiveSequence > segment.sequence 1420 || (segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0 1421 && (fSendUnacknowledged > segment.acknowledge 1422 || fSendMax < segment.acknowledge)))) { 1423 // reset the connection - either the initial SYN was faulty, or we 1424 // received a SYN within the data stream 1425 return DROP | RESET; 1426 } 1427 1428 fReceiveWindow = max_c(fReceiveQueue.Free(), fReceiveWindow); 1429 // the window must not shrink 1430 1431 // trim buffer to be within the receive window 1432 int32 drop = fReceiveNext - segment.sequence; 1433 if (drop > 0) { 1434 if ((uint32)drop > buffer->size 1435 || ((uint32)drop == buffer->size 1436 && (segment.flags & TCP_FLAG_FINISH) == 0)) { 1437 // don't accidently remove a FIN we shouldn't remove 1438 segment.flags &= ~TCP_FLAG_FINISH; 1439 drop = buffer->size; 1440 } 1441 1442 // remove duplicate data at the start 1443 TRACE("* remove %ld bytes from the start", drop); 1444 gBufferModule->remove_header(buffer, drop); 1445 segment.sequence += drop; 1446 } 1447 1448 int32 action = KEEP; 1449 1450 drop = segment.sequence + buffer->size - (fReceiveNext + fReceiveWindow); 1451 if (drop > 0) { 1452 // remove data exceeding our window 1453 if ((uint32)drop >= buffer->size) { 1454 // if we can accept data, or the segment is not what we'd expect, 1455 // drop the segment (an immediate acknowledge is always triggered) 1456 if (fReceiveWindow != 0 || segment.sequence != fReceiveNext) 1457 return DROP | IMMEDIATE_ACKNOWLEDGE; 1458 1459 action |= IMMEDIATE_ACKNOWLEDGE; 1460 } 1461 1462 if ((segment.flags & TCP_FLAG_FINISH) != 0) { 1463 // we need to remove the finish, too, as part of the data 1464 drop--; 1465 } 1466 1467 segment.flags &= ~(TCP_FLAG_FINISH | TCP_FLAG_PUSH); 1468 TRACE("* remove %ld bytes from the end", drop); 1469 gBufferModule->remove_trailer(buffer, drop); 1470 } 1471 1472 if (advertisedWindow > fSendWindow) 1473 TRACE(" Receive(): Window update %lu -> %lu", fSendWindow, 1474 advertisedWindow); 1475 1476 fSendWindow = advertisedWindow; 1477 if (advertisedWindow > fSendMaxWindow) 1478 fSendMaxWindow = advertisedWindow; 1479 1480 // Then look at the acknowledgement for any updates 1481 1482 if ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0) { 1483 // process acknowledged data 1484 if (fState == SYNCHRONIZE_RECEIVED) 1485 _MarkEstablished(); 1486 1487 if (fSendMax < segment.acknowledge || fState == TIME_WAIT) 1488 return DROP | IMMEDIATE_ACKNOWLEDGE; 1489 1490 if (segment.acknowledge < fSendUnacknowledged) { 1491 if (buffer->size == 0 && advertisedWindow == fSendWindow 1492 && (segment.flags & TCP_FLAG_FINISH) == 0) { 1493 TRACE("Receive(): duplicate ack!"); 1494 1495 _DuplicateAcknowledge(segment); 1496 } 1497 1498 return DROP; 1499 } else { 1500 // this segment acknowledges in flight data 1501 1502 if (fDuplicateAcknowledgeCount >= 3) { 1503 // deflate the window. 1504 fCongestionWindow = fSlowStartThreshold; 1505 } 1506 1507 fDuplicateAcknowledgeCount = 0; 1508 1509 if (fSendMax == segment.acknowledge) 1510 TRACE("Receive(): all inflight data ack'd!"); 1511 1512 if (segment.acknowledge > fSendQueue.LastSequence() 1513 && fState > ESTABLISHED) { 1514 TRACE("Receive(): FIN has been acknowledged!"); 1515 1516 switch (fState) { 1517 case FINISH_SENT: 1518 fState = FINISH_ACKNOWLEDGED; 1519 break; 1520 case CLOSING: 1521 fState = TIME_WAIT; 1522 _EnterTimeWait(); 1523 return DROP; 1524 case WAIT_FOR_FINISH_ACKNOWLEDGE: 1525 fState = CLOSED; 1526 break; 1527 1528 default: 1529 break; 1530 } 1531 } 1532 1533 if (fState != CLOSED) 1534 _Acknowledged(segment); 1535 } 1536 } 1537 1538 if (segment.flags & TCP_FLAG_URGENT) { 1539 if (fState == ESTABLISHED || fState == FINISH_SENT 1540 || fState == FINISH_ACKNOWLEDGED) { 1541 // TODO: Handle urgent data: 1542 // - RCV.UP <- max(RCV.UP, SEG.UP) 1543 // - signal the user that urgent data is available (SIGURG) 1544 } 1545 } 1546 1547 bool notify = false; 1548 1549 if (buffer->size > 0 && _ShouldReceive()) 1550 notify = _AddData(segment, buffer); 1551 else 1552 action = (action & ~KEEP) | DROP; 1553 1554 if (segment.flags & TCP_FLAG_FINISH) { 1555 segmentLength++; 1556 if (fState != CLOSED && fState != LISTEN && fState != SYNCHRONIZE_SENT) { 1557 TRACE("Receive(): peer is finishing connection!"); 1558 fReceiveNext++; 1559 notify = true; 1560 1561 // FIN implies PSH 1562 fReceiveQueue.SetPushPointer(); 1563 1564 // we'll reply immediatly to the FIN if we are not 1565 // transitioning to TIME WAIT so we immediatly ACK it. 1566 action |= IMMEDIATE_ACKNOWLEDGE; 1567 1568 // other side is closing connection; change states 1569 switch (fState) { 1570 case ESTABLISHED: 1571 case SYNCHRONIZE_RECEIVED: 1572 fState = FINISH_RECEIVED; 1573 break; 1574 case FINISH_SENT: 1575 // simultaneous close 1576 fState = CLOSING; 1577 break; 1578 case FINISH_ACKNOWLEDGED: 1579 fState = TIME_WAIT; 1580 _EnterTimeWait(); 1581 break; 1582 default: 1583 break; 1584 } 1585 } 1586 } 1587 1588 if (notify) 1589 _NotifyReader(); 1590 1591 if (buffer->size > 0 || (segment.flags & TCP_FLAG_SYNCHRONIZE) != 0) 1592 action |= ACKNOWLEDGE; 1593 1594 _UpdateTimestamps(segment, segmentLength); 1595 1596 TRACE("Receive() Action %ld", action); 1597 1598 return action; 1599 } 1600 1601 1602 void 1603 TCPEndpoint::_UpdateTimestamps(tcp_segment_header &segment, 1604 size_t segmentLength) 1605 { 1606 if (fFlags & FLAG_OPTION_TIMESTAMP) { 1607 tcp_sequence sequence(segment.sequence); 1608 1609 if ((fLastAcknowledgeSent >= sequence 1610 && fLastAcknowledgeSent < (sequence + segmentLength))) 1611 fReceivedTimestamp = segment.timestamp_value; 1612 } 1613 } 1614 1615 1616 void 1617 TCPEndpoint::_MarkEstablished() 1618 { 1619 fState = ESTABLISHED; 1620 1621 if (socket->parent != NULL) { 1622 gSocketModule->set_connected(socket); 1623 release_sem_etc(fAcceptSemaphore, 1, B_DO_NOT_RESCHEDULE); 1624 } 1625 1626 fSendList.Signal(); 1627 } 1628 1629 1630 status_t 1631 TCPEndpoint::_WaitForEstablished(MutexLocker &locker, bigtime_t timeout) 1632 { 1633 while (fState != ESTABLISHED) { 1634 status_t status = fSendList.Wait(locker, timeout); 1635 if (status < B_OK) 1636 return status; 1637 } 1638 1639 return B_OK; 1640 } 1641 1642 1643 bool 1644 TCPEndpoint::_AddData(tcp_segment_header &segment, net_buffer *buffer) 1645 { 1646 fReceiveQueue.Add(buffer, segment.sequence); 1647 fReceiveNext = fReceiveQueue.NextSequence(); 1648 1649 TRACE(" _AddData(): adding data, receive next = %lu. Now have %lu bytes.", 1650 (uint32)fReceiveNext, fReceiveQueue.Available()); 1651 1652 if (segment.flags & TCP_FLAG_PUSH) 1653 fReceiveQueue.SetPushPointer(); 1654 1655 return fReceiveQueue.Available() > 0; 1656 } 1657 1658 1659 void 1660 TCPEndpoint::_PrepareReceivePath(tcp_segment_header &segment) 1661 { 1662 fInitialReceiveSequence = segment.sequence; 1663 1664 // count the received SYN 1665 segment.sequence++; 1666 1667 fReceiveNext = segment.sequence; 1668 fReceiveQueue.SetInitialSequence(segment.sequence); 1669 1670 if ((fOptions & TCP_NOOPT) == 0) { 1671 if (segment.max_segment_size > 0) 1672 fSendMaxSegmentSize = segment.max_segment_size; 1673 1674 if (segment.options & TCP_HAS_WINDOW_SCALE) { 1675 fFlags |= FLAG_OPTION_WINDOW_SCALE; 1676 fSendWindowShift = segment.window_shift; 1677 } else { 1678 fFlags &= ~FLAG_OPTION_WINDOW_SCALE; 1679 fReceiveWindowShift = 0; 1680 } 1681 1682 if (segment.options & TCP_HAS_TIMESTAMPS) { 1683 fFlags |= FLAG_OPTION_TIMESTAMP; 1684 fReceivedTimestamp = segment.timestamp_value; 1685 } else 1686 fFlags &= ~FLAG_OPTION_TIMESTAMP; 1687 } 1688 1689 fCongestionWindow = 2 * fSendMaxSegmentSize; 1690 fSlowStartThreshold = (uint32)segment.advertised_window << fSendWindowShift; 1691 } 1692 1693 1694 status_t 1695 TCPEndpoint::_PrepareSendPath(const sockaddr *peer) 1696 { 1697 if (fRoute == NULL) { 1698 fRoute = gDatalinkModule->get_route(Domain(), peer); 1699 if (fRoute == NULL) 1700 return ENETUNREACH; 1701 } 1702 1703 // make sure connection does not already exist 1704 status_t status = fManager->SetConnection(this, *LocalAddress(), peer, 1705 fRoute->interface->address); 1706 if (status < B_OK) 1707 return status; 1708 1709 fInitialSendSequence = system_time() >> 4; 1710 fSendNext = fInitialSendSequence; 1711 fSendUnacknowledged = fInitialSendSequence; 1712 fSendMax = fInitialSendSequence; 1713 1714 // we are counting the SYN here 1715 fSendQueue.SetInitialSequence(fSendNext + 1); 1716 1717 fReceiveMaxSegmentSize = _MaxSegmentSize(peer); 1718 1719 // Compute the window shift we advertise to our peer - if it doesn't support 1720 // this option, this will be reset to 0 (when its SYN is received) 1721 fReceiveWindowShift = 0; 1722 while (fReceiveWindowShift < TCP_MAX_WINDOW_SHIFT 1723 && (0xffffUL << fReceiveWindowShift) < socket->receive.buffer_size) { 1724 fReceiveWindowShift++; 1725 } 1726 1727 return B_OK; 1728 } 1729 1730 1731 void 1732 TCPEndpoint::_Acknowledged(tcp_segment_header &segment) 1733 { 1734 size_t previouslyUsed = fSendQueue.Used(); 1735 1736 fSendQueue.RemoveUntil(segment.acknowledge); 1737 fSendUnacknowledged = segment.acknowledge; 1738 1739 if (fSendNext < fSendUnacknowledged) 1740 fSendNext = fSendUnacknowledged; 1741 1742 if (fSendUnacknowledged == fSendMax) 1743 gStackModule->cancel_timer(&fRetransmitTimer); 1744 1745 if (fSendQueue.Used() < previouslyUsed) { 1746 // this ACK acknowledged data 1747 1748 if (segment.options & TCP_HAS_TIMESTAMPS) 1749 _UpdateSRTT(tcp_diff_timestamp(segment.timestamp_reply)); 1750 else { 1751 // TODO Fallback to RFC 793 type estimation 1752 } 1753 1754 if (is_writable(fState)) { 1755 // notify threads waiting on the socket to become writable again 1756 fSendList.Signal(); 1757 gSocketModule->notify(socket, B_SELECT_WRITE, fSendQueue.Used()); 1758 } 1759 1760 if (fCongestionWindow < fSlowStartThreshold) 1761 fCongestionWindow += fSendMaxSegmentSize; 1762 } 1763 1764 if (fCongestionWindow >= fSlowStartThreshold) { 1765 uint32 increment = fSendMaxSegmentSize * fSendMaxSegmentSize; 1766 1767 if (increment < fCongestionWindow) 1768 increment = 1; 1769 else 1770 increment /= fCongestionWindow; 1771 1772 fCongestionWindow += increment; 1773 } 1774 1775 // if there is data left to be send, send it now 1776 if (fSendQueue.Used() > 0) 1777 _SendQueued(); 1778 } 1779 1780 1781 void 1782 TCPEndpoint::_Retransmit() 1783 { 1784 TRACE("Retransmit()"); 1785 _ResetSlowStart(); 1786 fSendNext = fSendUnacknowledged; 1787 _SendQueued(); 1788 } 1789 1790 1791 void 1792 TCPEndpoint::_UpdateSRTT(int32 roundTripTime) 1793 { 1794 int32 rtt = roundTripTime; 1795 1796 // Update_SRTT() as per Van Jacobson 1797 rtt -= (fRoundTripTime / 8); 1798 fRoundTripTime += rtt; 1799 if (rtt < 0) 1800 rtt = -rtt; 1801 rtt -= (fRoundTripDeviation / 4); 1802 fRoundTripDeviation += rtt; 1803 1804 fRetransmitTimeout = ((fRoundTripTime / 4 + 1805 fRoundTripDeviation) / 2) * kTimestampFactor; 1806 1807 TRACE(" RTO is now %llu (after rtt %ldms)", fRetransmitTimeout, 1808 roundTripTime); 1809 } 1810 1811 1812 void 1813 TCPEndpoint::_ResetSlowStart() 1814 { 1815 fSlowStartThreshold = max_c((fSendMax - fSendUnacknowledged) / 2, 1816 2 * fSendMaxSegmentSize); 1817 fCongestionWindow = fSendMaxSegmentSize; 1818 } 1819 1820 1821 void 1822 TCPEndpoint::_DuplicateAcknowledge(tcp_segment_header &segment) 1823 { 1824 fDuplicateAcknowledgeCount++; 1825 1826 if (fDuplicateAcknowledgeCount < 3) 1827 return; 1828 else if (fDuplicateAcknowledgeCount == 3) { 1829 _ResetSlowStart(); 1830 fCongestionWindow = fSlowStartThreshold + 3 1831 * fSendMaxSegmentSize; 1832 fSendNext = segment.acknowledge; 1833 } else if (fDuplicateAcknowledgeCount > 3) 1834 fCongestionWindow += fSendMaxSegmentSize; 1835 1836 _SendQueued(); 1837 } 1838 1839 1840 // #pragma mark - timer 1841 1842 1843 /*static*/ void 1844 TCPEndpoint::_RetransmitTimer(net_timer *timer, void *data) 1845 { 1846 TCPEndpoint *endpoint = (TCPEndpoint *)data; 1847 1848 MutexLocker locker(endpoint->fLock); 1849 if (!locker.IsLocked()) 1850 return; 1851 1852 endpoint->_Retransmit(); 1853 } 1854 1855 1856 /*static*/ void 1857 TCPEndpoint::_PersistTimer(net_timer *timer, void *data) 1858 { 1859 TCPEndpoint *endpoint = (TCPEndpoint *)data; 1860 1861 MutexLocker locker(endpoint->fLock); 1862 if (!locker.IsLocked()) 1863 return; 1864 1865 endpoint->_SendQueued(true); 1866 } 1867 1868 1869 /*static*/ void 1870 TCPEndpoint::_DelayedAcknowledgeTimer(struct net_timer *timer, void *data) 1871 { 1872 TCPEndpoint *endpoint = (TCPEndpoint *)data; 1873 1874 MutexLocker locker(endpoint->fLock); 1875 if (!locker.IsLocked()) 1876 return; 1877 1878 endpoint->SendAcknowledge(true); 1879 } 1880 1881 1882 /*static*/ void 1883 TCPEndpoint::_TimeWaitTimer(struct net_timer *timer, void *data) 1884 { 1885 TCPEndpoint *endpoint = (TCPEndpoint *)data; 1886 1887 if (mutex_lock(&endpoint->fLock) < B_OK) 1888 return; 1889 1890 endpoint->DeleteSocket(); 1891 } 1892 1893